From b7306fb8f9e10f6695ad63b5fce0679e00901677 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 13:46:44 +0300 Subject: [PATCH 01/40] Fix docstrings formatting --- scrapinghub/client/collections.py | 4 ++-- scrapinghub/client/frontiers.py | 8 ++++---- scrapinghub/client/jobs.py | 21 ++++++++++----------- scrapinghub/client/projects.py | 4 ++-- scrapinghub/client/spiders.py | 4 ++-- 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index a2d5e22f..ddb37507 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -91,7 +91,7 @@ def list(self): :return: a list of collections where each collection is represented by a dictionary with ('name','type') fields. - :rtype: list[dict] + :rtype: list of dicts """ return list(self.iter()) @@ -171,7 +171,7 @@ def list(self, key=None, prefix=None, prefixcount=None, startts=None, :param requests_params: (optional) a dict with optional requests params. :param \*\*params: (optional) additional query params for the request. :return: a list of items where each item is represented with a dict. - :rtype: list[dict] + :rtype: list of dicts # FIXME there should be similar docstrings for iter/iter_raw_json # but as we proxy them as-is, it's not in place, should be improved diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index bf2b460b..9e6ca7b6 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -105,7 +105,7 @@ def list(self): """List frontiers names. :return: a list of frontiers names. - :rtype: list[str] + :rtype: list of strings """ return next(self._origin.apiget('list')) @@ -171,7 +171,7 @@ def list(self): """List all slots. :return: a list of frontier slots names. - :rtype: list[str] + :rtype: list of strings """ return next(self._frontiers._origin.apiget((self.key, 'list'))) @@ -315,7 +315,7 @@ def list(self, **params): :param \*\*params: (optional) additional query params for the request. :return: a list of fingerprints. - :rtype: list[str] + :rtype: list of strings """ return list(self.iter(**params)) @@ -353,7 +353,7 @@ def list(self, mincount=None, **params): :param \*\*params: (optional) additional query params for the request. :return: a list of request batches in the queue where each batch is represented with a dict with ('id', 'requests') field. - :rtype: list[dict] + :rtype: list of dicts """ return list(self.iter(mincount=mincount, **params)) diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index 43837216..62bc53f6 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -1,5 +1,4 @@ from __future__ import absolute_import -import json from ..hubstorage.job import JobMeta as _JobMeta from ..hubstorage.job import Items as _Items @@ -113,16 +112,16 @@ def iter(self, count=None, start=None, spider=None, state=None, >>> [job['key'] for job in jobs_summary] ['123/1/3', '123/1/2', '123/1/1'] - - job summary fieldset is less detailed than job.metadata but - contains few new fields as well. Additional fields can be requested - using ``meta`` parameter. If it's used, then it's up to the user - to list all the required fields, so only few default fields would - be added except requested ones:: + - job summary fieldset is less detailed than job.metadata but contains + few new fields as well. Additional fields can be requested using + ``meta`` parameter. 
If it's used, then it's up to the user to list + all the required fields, so only few default fields would be added + except requested ones:: >>> jobs_summary = project.jobs.iter(meta=['scheduled_by', ]) - by default :meth:`Jobs.iter` returns maximum last 1000 results. - Pagination is available using start parameter:: + Pagination is available using start parameter:: >>> jobs_summary = spider.jobs.iter(start=1000) @@ -165,7 +164,7 @@ def list(self, count=None, start=None, spider=None, state=None, :param \*\*params: (optional) other filter params. :return: list of dictionaries of jobs summary for a given filter params - :rtype: list[dict] + :rtype: list of dicts Please note that list() method can use a lot of memory and for a large amount of jobs it's recommended to iterate through it via iter() @@ -257,11 +256,11 @@ def summary(self, state=None, spider=None, **params): :param state: (optional) a string state to filter jobs. :param spider: (optional) a spider name - (not needed if instantiated with :cls:`Spider`). + (not needed if instantiated with :class:`Spider`). :param \*\*params: (optional) additional keyword args. :return: a list of dictionaries of jobs summary for a given filter params grouped by job state. - :rtype: list[dict] + :rtype: list of dicts Usage:: @@ -285,7 +284,7 @@ def iter_last(self, start=None, start_after=None, count=None, :param start_after: (optional) :param count: (optional) :param spider: (optional) a spider name - (not needed if instantiated with :cls:`Spider`). + (not needed if instantiated with :class:`Spider`). :param \*\*params: (optional) additional keyword args. :return: a generator object over a list of dictionaries of jobs summary for a given filter params. diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py index d8fcbf9f..5e9e9bec 100644 --- a/scrapinghub/client/projects.py +++ b/scrapinghub/client/projects.py @@ -46,7 +46,7 @@ def list(self): """Get list of projects available to current user. :return: a list of project ids. - :rtype: list[int] + :rtype: list of integers Usage:: @@ -72,7 +72,7 @@ def summary(self, state=None, **params): :return: a list of dictionaries: each dictionary represents a project summary (amount of pending/running/finished jobs and a flag if it has a capacity to run new jobs). - :rtype: list[dict] + :rtype: list of dicts Usage:: diff --git a/scrapinghub/client/spiders.py b/scrapinghub/client/spiders.py index 522ecb05..9bfe302e 100644 --- a/scrapinghub/client/spiders.py +++ b/scrapinghub/client/spiders.py @@ -52,7 +52,7 @@ def list(self): """Get a list of spiders for a project. :return: a list of dictionaries with spiders metadata. - :rtype: list[dict] + :rtype: list of dicts Usage:: @@ -122,7 +122,7 @@ def list_tags(self): """List spider tags. :return: a list of spider tags. 
- :rtype: list[str] + :rtype: list of strings """ path = 'v2/projects/{}/spiders/{}'.format(self.project_id, self._id) url = urljoin(self._client._connection.url, path) From 45a582c113c1f5f6e93fdb641b6332ba442dfca0 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 13:47:51 +0300 Subject: [PATCH 02/40] Add apidocs documentation --- docs/Makefile | 25 +++++ docs/conf.py | 168 ++++++++++++++++++++++++++++ docs/index.rst | 20 ++++ docs/make.bat | 36 ++++++ docs/modules/modules.rst | 7 ++ docs/modules/scrapinghub.client.rst | 110 ++++++++++++++++++ docs/modules/scrapinghub.rst | 17 +++ 7 files changed, 383 insertions(+) create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 docs/modules/modules.rst create mode 100644 docs/modules/scrapinghub.client.rst create mode 100644 docs/modules/scrapinghub.rst diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..f775988e --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,25 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = python-scrapinghub +SPHINXAPIDOCS = sphinx-apidoc +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +apidocs: + # deprecated packages are not included to apidocs + @$(SPHINXAPIDOCS) -o docs/modules scrapinghub scrapinghub/legacy.py scrapinghub/hubstorage + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..76f57107 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- +# +# python-scrapinghub documentation build configuration file, created by +# sphinx-quickstart on Fri Mar 24 12:28:40 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['sphinx.ext.autodoc'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = u'scrapinghub' +copyright = u'2017, Pablo Hoffman, Daniel Graña' +author = u'Pablo Hoffman, Daniel Graña' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = u'2.0.0.dev0' +# The full version, including alpha/beta/rc tags. +release = u'2.0.0.dev0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'python-scrapinghubdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'python-scrapinghub.tex', u'python-scrapinghub Documentation', + u'Pablo Hoffman, Daniel Graña', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'python-scrapinghub', u'python-scrapinghub Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'python-scrapinghub', u'python-scrapinghub Documentation', + author, 'python-scrapinghub', 'One line description of project.', + 'Miscellaneous'), +] + +# Following is taken from https://github.com/snide/sphinx_rtd_theme# +# using-this-theme-locally-then-building-on-read-the-docs + +# on_rtd is whether we are on readthedocs.org, +# this line of code grabbed from docs.readthedocs.org + +on_rtd = os.environ.get('READTHEDOCS', None) == 'True' + +if not on_rtd: # only import and set the theme if we're building docs locally + import sphinx_rtd_theme + html_theme = 'sphinx_rtd_theme' + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# otherwise, readthedocs.org uses their theme by default, no need to specify it diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..4a8708f7 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,20 @@ +.. python-scrapinghub documentation master file, created by + sphinx-quickstart on Fri Mar 24 12:28:40 2017. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to python-scrapinghub's documentation! +============================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + modules/modules + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..f7dfb3e2 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=python-scrapinghub + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/modules/modules.rst b/docs/modules/modules.rst new file mode 100644 index 00000000..bb76eb58 --- /dev/null +++ b/docs/modules/modules.rst @@ -0,0 +1,7 @@ +scrapinghub +=========== + +.. toctree:: + :maxdepth: 4 + + scrapinghub diff --git a/docs/modules/scrapinghub.client.rst b/docs/modules/scrapinghub.client.rst new file mode 100644 index 00000000..2ed4b1dd --- /dev/null +++ b/docs/modules/scrapinghub.client.rst @@ -0,0 +1,110 @@ +scrapinghub.client package +========================== + +Submodules +---------- + +scrapinghub.client.activity module +---------------------------------- + +.. automodule:: scrapinghub.client.activity + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.collections module +------------------------------------- + +.. automodule:: scrapinghub.client.collections + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.exceptions module +------------------------------------ + +.. 
automodule:: scrapinghub.client.exceptions + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.frontiers module +----------------------------------- + +.. automodule:: scrapinghub.client.frontiers + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.items module +------------------------------- + +.. automodule:: scrapinghub.client.items + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.jobs module +------------------------------ + +.. automodule:: scrapinghub.client.jobs + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.logs module +------------------------------ + +.. automodule:: scrapinghub.client.logs + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.projects module +---------------------------------- + +.. automodule:: scrapinghub.client.projects + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.requests module +---------------------------------- + +.. automodule:: scrapinghub.client.requests + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.samples module +--------------------------------- + +.. automodule:: scrapinghub.client.samples + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.spiders module +--------------------------------- + +.. automodule:: scrapinghub.client.spiders + :members: + :undoc-members: + :show-inheritance: + +scrapinghub.client.utils module +------------------------------- + +.. automodule:: scrapinghub.client.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: scrapinghub.client + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/scrapinghub.rst b/docs/modules/scrapinghub.rst new file mode 100644 index 00000000..b8e9fb2e --- /dev/null +++ b/docs/modules/scrapinghub.rst @@ -0,0 +1,17 @@ +scrapinghub package +=================== + +Subpackages +----------- + +.. toctree:: + + scrapinghub.client + +Module contents +--------------- + +.. automodule:: scrapinghub + :members: + :undoc-members: + :show-inheritance: From f9299e63969bc51921e97c1ae879ba6ab15218df Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 13:54:40 +0300 Subject: [PATCH 03/40] Do not hardcode version in the docs --- docs/Makefile | 2 +- docs/conf.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index f775988e..823aeefa 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -16,7 +16,7 @@ help: .PHONY: help Makefile apidocs: - # deprecated packages are not included to apidocs + # deprecated packages are not included to apidocs @$(SPHINXAPIDOCS) -o docs/modules scrapinghub scrapinghub/legacy.py scrapinghub/hubstorage # Catch-all target: route all unknown targets to Sphinx using the new diff --git a/docs/conf.py b/docs/conf.py index 76f57107..432901b3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,6 +20,8 @@ import sys sys.path.insert(0, os.path.abspath('..')) +from scrapinghub import __version__ # noqa +VERSION = __version__.rsplit('.', 2)[0] # -- General configuration ------------------------------------------------ @@ -54,9 +56,9 @@ # built documents. # # The short X.Y version. -version = u'2.0.0.dev0' +version = VERSION # The full version, including alpha/beta/rc tags. -release = u'2.0.0.dev0' +release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
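A brief note on the change above: ``rsplit('.', 2)`` keeps only the leading ``X.Y`` part of the package version for Sphinx's short ``version`` field, while ``release`` keeps the full string. A minimal sketch of what the expression evaluates to, using the version string that was previously hardcoded in ``conf.py``::

    >>> __version__ = '2.0.0.dev0'
    >>> __version__.rsplit('.', 2)[0]  # short X.Y version used by Sphinx
    '2.0'
    >>> __version__                    # full release string, used as-is
    '2.0.0.dev0'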
From b60c7ed3929e3aca2f051e7cb114a3851b296bee Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 17:03:09 +0300 Subject: [PATCH 04/40] Fix class references in docstrings --- scrapinghub/client/frontiers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index 9e6ca7b6..dedad31a 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -88,7 +88,7 @@ def get(self, name): """Get a frontier by name. :param name: a frontier name string. - :return: class:`Frontier` instance. + :return: :class:`Frontier` instance. :rtype: Frontier """ return Frontier(self._client, self, name) @@ -154,7 +154,7 @@ def __init__(self, client, frontiers, name): def get(self, slot): """Get a slot by name. - :return: class:`FrontierSlot` instance. + :return: :class:`FrontierSlot` instance. :rtype: FrontierSlot """ return FrontierSlot(self._client, self, slot) @@ -249,7 +249,7 @@ def __init__(self, client, frontier, slot): def f(self): """Shortcut to have quick access to slot fingerprints. - :return: class:`FrontierSlotFingerprints` instance. + :return: :class:`FrontierSlotFingerprints` instance. :rtype: FrontierSlotFingerprints """ return self.fingerprints @@ -258,7 +258,7 @@ def f(self): def q(self): """Shortcut to have quick access to a slot queue. - :return: class:`FrontierSlotQueue` instance. + :return: :class:`FrontierSlotQueue` instance. :rtype: FrontierSlotQueue """ return self.queue From fda8d19c5487c84d9d2b85d4832578ee278b94ee Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 17:44:55 +0300 Subject: [PATCH 05/40] Move README to docs and split it --- .gitignore | 2 + README.rst | 956 +----------------- docs/Makefile | 4 - ...apinghub.client.rst => client_apidocs.rst} | 18 +- docs/index.rst | 59 +- docs/legacy_connection.rst | 106 ++ docs/legacy_hubstorage.rst | 244 +++++ docs/modules/modules.rst | 7 - docs/modules/scrapinghub.rst | 17 - docs/overview.rst | 585 +++++++++++ docs/testing.rst | 28 + 11 files changed, 1035 insertions(+), 991 deletions(-) rename docs/{modules/scrapinghub.client.rst => client_apidocs.rst} (100%) create mode 100644 docs/legacy_connection.rst create mode 100644 docs/legacy_hubstorage.rst delete mode 100644 docs/modules/modules.rst delete mode 100644 docs/modules/scrapinghub.rst create mode 100644 docs/overview.rst create mode 100644 docs/testing.rst diff --git a/.gitignore b/.gitignore index d9c603d5..50003f37 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ dist /.cache .coverage +# documentation +docs/_build diff --git a/README.rst b/README.rst index 4e597115..988d7489 100644 --- a/README.rst +++ b/README.rst @@ -1,17 +1,9 @@ -==================================== -Client interface for Scrapinghub API -==================================== - -.. image:: https://secure.travis-ci.org/scrapinghub/python-scrapinghub.png?branch=master - :target: http://travis-ci.org/scrapinghub/python-scrapinghub - +Scrapinghub command line client +=============================== The ``scrapinghub`` is a Python library for communicating with the `Scrapinghub API`_. -.. contents:: :depth: 2 - - Requirements ============ @@ -31,947 +23,11 @@ response time and improved bandwidth usage:: pip install scrapinghub[msgpack] -New client -========== - -The ``scrapinghub.ScrapinghubClient`` is a new Python client for communicating -with the `Scrapinghub API`_. 
It takes best from ``scrapinghub.Connection`` and -``scrapinghub.HubstorageClient`` and combines it under single interface. - -First, you instantiate new client:: - - >>> from scrapinghub import ScrapinghubClient - >>> client = ScrapinghubClient('APIKEY') - >>> client - - -Client instance has ``projects`` field for access to client projects. - -Projects --------- - -You can list the projects available to your account:: - - >>> client.projects.list() - [123, 456] - -Or check the projects summary:: - - >>> client.projects.summary() - [{'finished': 674, - 'has_capacity': True, - 'pending': 0, - 'project': 123, - 'running': 1}, - {'finished': 33079, - 'has_capacity': True, - 'pending': 0, - 'project': 456, - 'running': 2}] - -And select a particular project to work with:: - - >>> project = client.get_project(123) - >>> project - - >>> project.key - '123' - -The above is a shortcut for ``client.projects.get(123)``. - -Project -------- - -Project instance has ``jobs`` field to work with the project jobs. - -Jobs instance is described well in ``Jobs`` section below. - -For example, to schedule a spider run (it returns a job object):: - - >>> project.jobs.run('spider1', job_args={'arg1':'val1'}) - > - -Project instance also has the following fields: - -- activity - access to project activity records -- collections - work with project collections (see ``Collections`` section) -- frontiers - using project frontier (see ``Frontiers`` section) -- settings - interface to project settings -- spiders - access to spiders collection (see ``Spiders`` section) - - -Settings --------- - -To get a list of the project settings:: - - >>> project.settings.list() - [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] - -To get a project setting value by name:: - - >>> project.settings.get('job_runtime_limit') - 24 - -To update a project setting value by name:: - - >>> project.settings.set('job_runtime_limit', 20) - -Or update a few project settings at once:: - - >>> project.settings.update({'default_job_units': 1, - ... 'job_runtime_limit': 20}) - - -Spiders -------- - -To get the list of spiders of the project:: - - >>> project.spiders.list() - [ - {'id': 'spider1', 'tags': [], 'type': 'manual', 'version': '123'}, - {'id': 'spider2', 'tags': [], 'type': 'manual', 'version': '123'} - ] - -To select a particular spider to work with:: - - >>> spider = project.spiders.get('spider2') - >>> spider - - >>> spider.key - '123/2' - >>> spider.name - spider2 - -Spider ------- - -Like project instance, spider instance has ``jobs`` field to work with the spider's jobs. - -To schedule a spider run:: - - >>> spider.jobs.run(job_args={'arg1:'val1'}) - > - -Note that you don't need to specify spider name explicitly. - -Jobs ----- - -Jobs collection is available on project/spider level. 
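Both levels expose the same ``Jobs`` interface, so the calls described in the subsections below can be made on either object. A small illustration (the returned counts are only example values)::

    >>> project.jobs.count()   # all jobs in the project
    12
    >>> spider.jobs.count()    # only jobs of this particular spider
    5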
- -get -^^^ - -To select a specific job for a project:: - - >>> job = project.jobs.get('123/1/2') - >>> job.key - '123/1/2' - -Also there's a shortcut to get same job with client instance:: - - >>> job = client.get_job('123/1/2') - -run -^^^ - -Use ``run`` method to run a new job for project/spider:: - - >>> job = spider.jobs.run() - -Scheduling logic supports different options, like - -- spider_args to provide spider arguments for the job -- units to specify amount of units to run the job -- job_settings to pass additional settings for the job -- priority to set higher/lower priority of the job -- add_tag to create a job with a set of initial tags -- meta to pass additional custom metadata - -For example, to run a new job for a given spider with custom params:: - - >>> job = spider.jobs.run(units=2, job_settings={'SETTING': 'VALUE'}, - priority=1, add_tag=['tagA','tagB'], meta={'custom-data': 'val1'}) - -Note that if you run a job on project level, spider name is required:: - - >>> job = project.jobs.run('spider1') - -count -^^^^^ - -It's also possible to count jobs for a given project/spider:: - - >>> spider.jobs.count() - 5 - -Count logic supports different filters, as described for `count endpoint`_. - - -iter -^^^^ - -To iterate through the spider jobs (descending order):: - - >>> jobs_summary = spider.jobs.iter() - >>> [j['key'] for j in jobs_summary] - ['123/1/3', '123/1/2', '123/1/1'] - -``jobs_summary`` is an iterator and, when iterated, returns an iterable -of dict objects, so you typically use it like this:: - - >>> for job in jobs_summary: - ... # do something with job data - -Or, if you just want to get the job ids:: - - >>> [x['key'] for x in jobs_summary] - ['123/1/3', '123/1/2', '123/1/1'] - -Job summary fieldset from ``iter()`` is less detailed than ``job.metadata``, -but contains few new fields as well. Additional fields can be requested using -the ``jobmeta`` parameter. If it used, then it's up to the user to list all the -required fields, so only few default fields would be added except requested -ones:: - - >>> job_summary = next(project.jobs.iter()) - >>> job_summary.get('spider', 'missing') - 'foo' - >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by', ]) - >>> job_summary = next(jobs_summary) - >>> job_summary.get('scheduled_by', 'missing') - 'John' - >>> job_summary.get('spider', 'missing') - missing - -By default ``jobs.iter()`` returns maximum last 1000 results. -Pagination is available using the ``start`` parameter:: - - >>> jobs_summary = spider.jobs.iter(start=1000) - -There are several filters like spider, state, has_tag, lacks_tag, -startts and endts (check `list endpoint`_ for more details). - -To get jobs filtered by tags:: - - >>> jobs_summary = project.jobs.iter(has_tag=['new', 'verified'], lacks_tag='obsolete') - -List of tags has ``OR`` power, so in the case above jobs with 'new' or -'verified' tag are expected. 
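``has_tag`` always combines its values with OR. If you need jobs carrying *both* tags, one possible client-side sketch is to filter on one tag in the request and check the other in Python (this assumes the ``tags`` field can be requested through ``jobmeta``)::

    >>> jobs_summary = project.jobs.iter(has_tag='new', jobmeta=['tags'])
    >>> both = [j['key'] for j in jobs_summary
    ...         if 'verified' in j.get('tags', [])]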
- -To get certain number of last finished jobs per some spider:: - - >>> jobs_summary = project.jobs.iter(spider='foo', state='finished', count=3) - -There are 4 possible job states, which can be used as values -for filtering by state: - -- pending -- running -- finished -- deleted - -Dict entries returned by ``iter`` method contain some additional meta, -but can be easily converted to ``Job`` instances with:: - - >>> [Job(x['key']) for x in jobs] - [ - , - , - , - ] - -summary -^^^^^^^ - -To check jobs summary:: - - >>> spider.jobs.summary() - [{'count': 0, 'name': 'pending', 'summary': []}, - {'count': 0, 'name': 'running', 'summary': []}, - {'count': 5, - 'name': 'finished', - 'summary': [...]} - -It's also possible to get last jobs summary (for each spider):: - - >>> list(sp.jobs.iter_last()) - [{'close_reason': 'success', - 'elapsed': 3062444, - 'errors': 1, - 'finished_time': 1482911633089, - 'key': '123/1/3', - 'logs': 8, - 'pending_time': 1482911596566, - 'running_time': 1482911598909, - 'spider': 'spider1', - 'state': 'finished', - 'ts': 1482911615830, - 'version': 'some-version'}] - -Note that there can be a lot of spiders, so the method above returns an iterator. - -Job ---- - -Job instance provides access to a job data with the following fields: - -- metadata -- items -- logs -- requests -- samples - -Request to cancel a job:: - - >>> job.cancel() - -To delete a job:: - - >>> job.delete() - -Metadata -^^^^^^^^ - -Job details can be found in jobs metadata and it's scrapystats:: - - >>> job.metadata.get('version') - '5123a86-master' - >>> job.metadata.get('scrapystats') - ... - 'downloader/response_count': 104, - 'downloader/response_status_count/200': 104, - 'finish_reason': 'finished', - 'finish_time': 1447160494937, - 'item_scraped_count': 50, - 'log_count/DEBUG': 157, - 'log_count/INFO': 1365, - 'log_count/WARNING': 3, - 'memusage/max': 182988800, - 'memusage/startup': 62439424, - ... - -Anything can be stored in metadata, here is example how to add tags:: - - >>> job.metadata.set('tags', ['obsolete']) - -Items -^^^^^ - -To retrieve all scraped items from a job:: - - >>> for item in job.items.iter(): - ... # do something with item (it's just a dict) - -Logs -^^^^ - -To retrieve all log entries from a job:: - - >>> for logitem in job.logs.iter(): - ... # logitem is a dict with level, message, time - >>> logitem - { - 'level': 20, - 'message': '[scrapy.core.engine] Closing spider (finished)', - 'time': 1482233733976}, - } - -Requests -^^^^^^^^ - -To retrieve all requests from a job:: - - >>> for reqitem in job.requests.iter(): - ... # reqitem is a dict - >>> reqitem - [{ - 'duration': 354, - 'fp': '6d748741a927b10454c83ac285b002cd239964ea', - 'method': 'GET', - 'rs': 1270, - 'status': 200, - 'time': 1482233733870, - 'url': 'https://example.com' - }] - -Samples -^^^^^^^ - -To retrieve all samples for a job:: - - >>> for sample in job.samples.iter(): - ... 
# sample is a list with a timestamp and data - >>> sample - [1482233732452, 0, 0, 0, 0, 0] - - -Activity --------- - -To retrieve all activity events from a project:: - - >>> project.activity.iter() - - - >>> project.activity.list() - [{'event': 'job:completed', 'job': '123/2/3', 'user': 'jobrunner'}, - {'event': 'job:cancelled', 'job': '123/2/3', 'user': 'john'}] - -To post a new activity event:: - - >>> event = {'event': 'job:completed', 'job': '123/2/4', 'user': 'john'} - >>> project.activity.add(event) - -Or post multiple events at once:: - - >>> events = [ - {'event': 'job:completed', 'job': '123/2/5', 'user': 'john'}, - {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, - ] - >>> project.activity.add(events) - - -Collections ------------ - -As an example, let's store hash and timestamp pair for foo spider. - -Usual workflow with `Collections`_ would be:: - - >>> collections = project.collections - >>> foo_store = collections.get_store('foo_store') - >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}) - >>> foo_store.count() - 1 - >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7') - {u'value': u'1447221694537'} - >>> # iterate over _key & value pair - ... list(foo_store.iter()) - [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] - >>> # filter by multiple keys - only values for keys that exist will be returned - ... list(foo_store.iter(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah'])) - [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] - >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') - >>> foo_store.count() - 0 - -Collections are available on project level only. - -Frontiers ---------- - -Typical workflow with `Frontier`_:: - - >>> frontiers = project.frontiers - -Get all frontiers from a project to iterate through it:: - - >>> frontiers.iter() - - -List all frontiers:: - - >>> frontiers.list() - ['test', 'test1', 'test2'] - -Get a frontier by name:: - - >>> frontier = frontiers.get('test') - >>> frontier - - -Get an iterator to iterate through a frontier slots:: - - >>> frontier.iter() - - -List all slots:: - - >>> frontier.list() - ['example.com', 'example.com2'] - -Get a frontier slot by name:: - - >>> slot = frontier.get('example.com') - >>> slot - - -Add a request to the slot:: - - >>> slot.queue.add([{'fp': '/some/path.html'}]) - >>> slot.flush() - >>> slot.newcount - 1 - -``newcount`` is defined per slot, but also available per frontier and globally:: - - >>> frontier.newcount - 1 - >>> frontiers.newcount - 3 - -Add a fingerprint only to the slot:: - - >>> slot.fingerprints.add(['fp1', 'fp2']) - >>> slot.flush() - -There are convenient shortcuts: ``f`` for ``fingerprints`` and ``q`` for ``queue``. 
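The shortcuts are plain aliases for the same objects (see the ``f`` and ``q`` properties updated earlier in this patch series), so both spellings are interchangeable::

    >>> slot.q is slot.queue
    True
    >>> slot.f is slot.fingerprints
    True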
- -Add requests with additional parameters:: - - >>> slot.q.add([{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}]) - >>> slot.flush() - -To retrieve all requests for a given slot:: - - >>> reqs = slot.q.iter() - -To retrieve all fingerprints for a given slot:: - - >>> fps = slot.f.iter() - -To list all the requests use ``list()`` method (similar for ``fingerprints``):: - - >>> fps = slot.q.list() - -To delete a batch of requests:: - - >>> slot.q.delete('00013967d8af7b0001') - -To delete the whole slot from the frontier:: - - >>> slot.delete() - -Flush data of the given frontier:: - - >>> frontier.flush() - -Flush data of all frontiers of a project:: - - >>> frontiers.flush() - -Close batch writers of all frontiers of a project:: - - >>> frontiers.close() - -Frontiers are available on project level only. - -Tags ----- - -Tags is a convenient way to mark specific jobs (for better search, postprocessing etc). - -To mark a job with tag ``consumed``:: - - >>> job.update_tags(add=['consumed']) - -To mark all spider jobs with tag ``consumed``:: - - >>> spider.jobs.update_tags(add=['consumed']) - -To remove existing tag ``existing`` for all spider jobs:: - - >>> spider.jobs.update_tags(remove=['existing']) - -Modifying tags is available on spider/job levels. - - -Exceptions ----------- - -scrapinghub.exceptions.ScrapinghubAPIError -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Base exception class. - - -scrapinghub.exceptions.InvalidUsage -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Usually raised in case of 400 response from API. - - -scrapinghub.exceptions.NotFound -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Entity doesn't exist (e.g. spider or project). - - -scrapinghub.exceptions.ValueTooLarge -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Value cannot be writtent because it exceeds size limits. - -scrapinghub.exceptions.DuplicateJobError -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Job for given spider with given arguments is already scheduled or running. - - - - -Legacy client -============= - -First, you connect to Scrapinghub:: - - >>> from scrapinghub import Connection - >>> conn = Connection('APIKEY') - >>> conn - Connection('APIKEY') - -You can list the projects available to your account:: - - >>> conn.project_ids() - [123, 456] - -And select a particular project to work with:: - - >>> project = conn[123] - >>> project - Project(Connection('APIKEY'), 123) - >>> project.id - 123 - -To schedule a spider run (it returns the job id):: - - >>> project.schedule('myspider', arg1='val1') - u'123/1/1' - -To get the list of spiders in the project:: - - >>> project.spiders() - [ - {u'id': u'spider1', u'tags': [], u'type': u'manual', u'version': u'123'}, - {u'id': u'spider2', u'tags': [], u'type': u'manual', u'version': u'123'} - ] - -To get all finished jobs:: - - >>> jobs = project.jobs(state='finished') - -``jobs`` is a ``JobSet``. ``JobSet`` objects are iterable and, when iterated, -return an iterable of ``Job`` objects, so you typically use it like this:: - - >>> for job in jobs: - ... # do something with job - -Or, if you just want to get the job ids:: - - >>> [x.id for x in jobs] - [u'123/1/1', u'123/1/2', u'123/1/3'] - -To select a specific job:: - - >>> job = project.job(u'123/1/2') - >>> job.id - u'123/1/2' - -To retrieve all scraped items from a job:: - - >>> for item in job.items(): - ... # do something with item (it's just a dict) - -To retrieve all log entries from a job:: - - >>> for logitem in job.log(): - ... 
# logitem is a dict with logLevel, message, time - -To get job info:: - - >>> job.info['spider'] - 'myspider' - >>> job.info['started_time'] - '2010-09-28T15:09:57.629000' - >>> job.info['tags'] - [] - >>> job.info['fields_count]['description'] - 1253 - -To mark a job with tag ``consumed``:: - - >>> job.update(add_tag='consumed') - -To mark several jobs with tag ``consumed`` (``JobSet`` also supports the -``update()`` method):: - - >>> project.jobs(state='finished').update(add_tag='consumed') - -To delete a job:: - - >>> job.delete() - -To delete several jobs (``JobSet`` also supports the ``update()`` method):: - - >>> project.jobs(state='finished').delete() - - -Legacy Hubstorage client -======================== - -The library can also be used for interaction with spiders, jobs and scraped data through ``storage.scrapinghub.com`` endpoints. - -First, use your API key for authorization:: - - >>> from scrapinghub import HubstorageClient - >>> hc = HubstorageClient(auth='apikey') - >>> hc.server_timestamp() - 1446222762611 - -Project -------- - -To get project settings or jobs summary:: - - >>> project = hc.get_project('1111111') - >>> project.settings['botgroups'] - [u'botgroup1', ] - >>> project.jobsummary() - {u'finished': 6, - u'has_capacity': True, - u'pending': 0, - u'project': 1111111, - u'running': 0} - -Spider ------- - -To get spider id correlated with its name:: - - >>> project.ids.spider('foo') - 1 - -To see last jobs summaries:: - - >>> summaries = project.spiders.lastjobsummary(count=3) - -To get job summary per spider:: - - >>> summary = project.spiders.lastjobsummary(spiderid='1') - -Job ---- - -Job can be **retrieved** directly by id (project_id/spider_id/job_id):: - - >>> job = hc.get_job('1111111/1/1') - >>> job.key - '1111111/1/1' - >>> job.metadata['state'] - u'finished' - -**Creating** a new job requires a spider name:: - - >>> job = hc.push_job(projectid='1111111', spidername='foo') - >>> job.key - '1111111/1/1' - -Priority can be between 0 and 4 (from lowest to highest), the default is 2. - -To push job from project level with the highest priority:: - - >>> job = project.push_job(spidername='foo', priority=4) - >>> job.metadata['priority'] - 4 - -Pushing a job with spider arguments:: - - >>> project.push_job(spidername='foo', spider_args={'arg1': 'foo', 'arg2': 'bar'}) - -Running job can be **cancelled** by calling ``request_cancel()``:: - - >>> job.request_cancel() - >>> job.metadata['cancelled_by'] - u'John' - -To **delete** job:: - - >>> job.purged() - >>> job.metadata['state'] - u'deleted' - -Job details ------------ - -Job details can be found in jobs metadata and it's scrapystats:: - - >>> job = hc.get_job('1111111/1/1') - >>> job.metadata['version'] - u'5123a86-master' - >>> job.metadata['scrapystats'] - ... - u'downloader/response_count': 104, - u'downloader/response_status_count/200': 104, - u'finish_reason': u'finished', - u'finish_time': 1447160494937, - u'item_scraped_count': 50, - u'log_count/DEBUG': 157, - u'log_count/INFO': 1365, - u'log_count/WARNING': 3, - u'memusage/max': 182988800, - u'memusage/startup': 62439424, - ... 
- -Anything can be stored in metadata, here is example how to add tags:: - - >>> job.update_metadata({'tags': 'obsolete'}) - -Jobs ----- - -To iterate through all jobs metadata per project (descending order):: - - >>> jobs_metadata = project.jobq.list() - >>> [j['key'] for j in jobs_metadata] - ['1111111/1/3', '1111111/1/2', '1111111/1/1'] - -Jobq metadata fieldset is less detailed, than ``job.metadata``, but contains few new fields as well. -Additional fields can be requested using the ``jobmeta`` parameter. -If it used, then it's up to the user to list all the required fields, so only few default fields would be added except requested ones:: - - >>> metadata = next(project.jobq.list()) - >>> metadata.get('spider', 'missing') - u'foo' - >>> jobs_metadata = project.jobq.list(jobmeta=['scheduled_by', ]) - >>> metadata = next(jobs_metadata) - >>> metadata.get('scheduled_by', 'missing') - u'John' - >>> metadata.get('spider', 'missing') - missing - -By default ``jobq.list()`` returns maximum last 1000 results. Pagination is available using the ``start`` parameter:: - - >>> jobs_metadata = project.jobq.list(start=1000) - -There are several filters like spider, state, has_tag, lacks_tag, startts and endts. -To get jobs filtered by tags:: - - >>> jobs_metadata = project.jobq.list(has_tag=['new', 'verified'], lacks_tag='obsolete') - -List of tags has ``OR`` power, so in the case above jobs with 'new' or 'verified' tag are expected. - -To get certain number of last finished jobs per some spider:: - - >>> jobs_metadata = project.jobq.list(spider='foo', state='finished' count=3) - -There are 4 possible job states, which can be used as values for filtering by state: - -- pending -- running -- finished -- deleted - - -Items ------ - -To iterate through items:: - - >>> items = job.items.iter_values() - >>> for item in items: - # do something, item is just a dict - -Logs ----- - -To iterate through 10 first logs for example:: - - >>> logs = job.logs.iter_values(count=10) - >>> for log in logs: - # do something, log is a dict with log level, message and time keys - -Collections ------------ - -Let's store hash and timestamp pair for foo spider. Usual workflow with `Collections`_ would be:: - - >>> collections = project.collections - >>> foo_store = collections.new_store('foo_store') - >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}) - >>> foo_store.count() - 1 - >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7') - {u'value': u'1447221694537'} - >>> # iterate over _key & value pair - ... list(foo_store.iter_values()) - [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] - >>> # filter by multiple keys - only values for keys that exist will be returned - ... 
list(foo_store.iter_values(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah'])) - [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] - >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') - >>> foo_store.count() - 0 - -Frontier --------- - -Typical workflow with `Frontier`_:: - - >>> frontier = project.frontier - -Add a request to the frontier:: - - >>> frontier.add('test', 'example.com', [{'fp': '/some/path.html'}]) - >>> frontier.flush() - >>> frontier.newcount - 1 - -Add requests with additional parameters:: - - >>> frontier.add('test', 'example.com', [{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}]) - >>> frontier.flush() - >>> frontier.newcount - 2 - -To delete the slot ``example.com`` from the frontier:: - - >>> frontier.delete_slot('test', 'example.com') - -To retrieve requests for a given slot:: - - >>> reqs = frontier.read('test', 'example.com') - -To delete a batch of requests:: - - >>> frontier.delete('test', 'example.com', '00013967d8af7b0001') - -To retrieve fingerprints for a given slot:: - - >>> fps = [req['requests'] for req in frontier.read('test', 'example.com')] - -Tests -===== - -The package is covered with integration tests based on `VCR.py library`_: there -are recorded cassettes files in ``tests/*/cassettes`` used instead of HTTP -requests to real services, it helps to simplify and speed up development. - -By default, tests use VCR.py ``once`` mode to: - -- replay previously recorded interactions. -- record new interactions if there is no cassette file. -- cause an error to be raised for new requests if there is a cassette file. - -It means that if you add new integration tests and run all tests as usual, -only new cassettes will be created, all existing cassettes will stay unmodified. - -To ignore existing cassettes and use real service, please provide a flag:: - - py.test --ignore-cassettes - -If you want to update/recreate all the cassettes from scratch, please use:: - - py.test --update-cassettes +Documentation +------------- -Note that internally the above command erases the whole folder with cassettes. +Documentation is available online via Read the Docs: +https://python-scrapinghub.readthedocs.io/, or in the ``docs`` directory. .. _Scrapinghub API: http://doc.scrapinghub.com/api.html -.. _Collections: http://doc.scrapinghub.com/api/collections.html -.. _Frontier: http://doc.scrapinghub.com/api/frontier.html -.. _VCR.py library: https://pypi.python.org/pypi/vcrpy -.. _count endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-count -.. _list endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-list diff --git a/docs/Makefile b/docs/Makefile index 823aeefa..c44acee0 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -15,10 +15,6 @@ help: .PHONY: help Makefile -apidocs: - # deprecated packages are not included to apidocs - @$(SPHINXAPIDOCS) -o docs/modules scrapinghub scrapinghub/legacy.py scrapinghub/hubstorage - # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile diff --git a/docs/modules/scrapinghub.client.rst b/docs/client_apidocs.rst similarity index 100% rename from docs/modules/scrapinghub.client.rst rename to docs/client_apidocs.rst index 2ed4b1dd..9ce0c8de 100644 --- a/docs/modules/scrapinghub.client.rst +++ b/docs/client_apidocs.rst @@ -1,6 +1,15 @@ scrapinghub.client package ========================== +Module contents +--------------- + +.. 
automodule:: scrapinghub.client + :members: + :undoc-members: + :show-inheritance: + + Submodules ---------- @@ -99,12 +108,3 @@ scrapinghub.client.utils module :members: :undoc-members: :show-inheritance: - - -Module contents ---------------- - -.. automodule:: scrapinghub.client - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/index.rst b/docs/index.rst index 4a8708f7..0d43899d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,14 +3,65 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to python-scrapinghub's documentation! -============================================== +==================================== +Client interface for Scrapinghub API +==================================== + +.. image:: https://secure.travis-ci.org/scrapinghub/python-scrapinghub.png?branch=master + :target: http://travis-ci.org/scrapinghub/python-scrapinghub + +The ``scrapinghub`` is a Python library for communicating with the `Scrapinghub API`_. + + +.. _Scrapinghub API: http://doc.scrapinghub.com/api.html + +Requirements +============ + +* Python 2.7 or above + + +Installation +============ + +The quick way:: + + pip install scrapinghub + +You can also install the library with MessagePack support, it provides better +response time and improved bandwidth usage:: + + pip install scrapinghub[msgpack] + + +ScrapinghubClient +================= .. toctree:: :maxdepth: 2 - :caption: Contents: - modules/modules + overview + client_apidocs + + +Legacy clients +============== + +.. toctree:: + :maxdepth: 2 + + legacy_connection + legacy_hubstorage + + +Tests +===== + +.. toctree:: + :maxdepth: 2 + + testing + Indices and tables ================== diff --git a/docs/legacy_connection.rst b/docs/legacy_connection.rst new file mode 100644 index 00000000..e46a6a7c --- /dev/null +++ b/docs/legacy_connection.rst @@ -0,0 +1,106 @@ +scrapinghub.legacy.Connection +============================= + +TODO add short description & deprecation message here! + + +First, you connect to Scrapinghub:: + + >>> from scrapinghub import Connection + >>> conn = Connection('APIKEY') + >>> conn + Connection('APIKEY') + +You can list the projects available to your account:: + + >>> conn.project_ids() + [123, 456] + +And select a particular project to work with:: + + >>> project = conn[123] + >>> project + Project(Connection('APIKEY'), 123) + >>> project.id + 123 + +To schedule a spider run (it returns the job id):: + + >>> project.schedule('myspider', arg1='val1') + u'123/1/1' + +To get the list of spiders in the project:: + + >>> project.spiders() + [ + {u'id': u'spider1', u'tags': [], u'type': u'manual', u'version': u'123'}, + {u'id': u'spider2', u'tags': [], u'type': u'manual', u'version': u'123'} + ] + +To get all finished jobs:: + + >>> jobs = project.jobs(state='finished') + +``jobs`` is a ``JobSet``. ``JobSet`` objects are iterable and, when iterated, +return an iterable of ``Job`` objects, so you typically use it like this:: + + >>> for job in jobs: + ... # do something with job + +Or, if you just want to get the job ids:: + + >>> [x.id for x in jobs] + [u'123/1/1', u'123/1/2', u'123/1/3'] + +To select a specific job:: + + >>> job = project.job(u'123/1/2') + >>> job.id + u'123/1/2' + +To retrieve all scraped items from a job:: + + >>> for item in job.items(): + ... # do something with item (it's just a dict) + +To retrieve all log entries from a job:: + + >>> for logitem in job.log(): + ... 
# logitem is a dict with logLevel, message, time + +To get job info:: + + >>> job.info['spider'] + 'myspider' + >>> job.info['started_time'] + '2010-09-28T15:09:57.629000' + >>> job.info['tags'] + [] + >>> job.info['fields_count]['description'] + 1253 + +To mark a job with tag ``consumed``:: + + >>> job.update(add_tag='consumed') + +To mark several jobs with tag ``consumed`` (``JobSet`` also supports the +``update()`` method):: + + >>> project.jobs(state='finished').update(add_tag='consumed') + +To delete a job:: + + >>> job.delete() + +To delete several jobs (``JobSet`` also supports the ``update()`` method):: + + >>> project.jobs(state='finished').delete() + + +Module contents +--------------- + +.. automodule:: scrapinghub.legacy + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/legacy_hubstorage.rst b/docs/legacy_hubstorage.rst new file mode 100644 index 00000000..3a3fe028 --- /dev/null +++ b/docs/legacy_hubstorage.rst @@ -0,0 +1,244 @@ +scrapinghub.hubstorage.HubstorageClient +======================================= + +TODO add short description & deprecation message here! + + +The library can also be used for interaction with spiders, jobs and scraped data through ``storage.scrapinghub.com`` endpoints. + +First, use your API key for authorization:: + + >>> from scrapinghub import HubstorageClient + >>> hc = HubstorageClient(auth='apikey') + >>> hc.server_timestamp() + 1446222762611 + +Project +------- + +To get project settings or jobs summary:: + + >>> project = hc.get_project('1111111') + >>> project.settings['botgroups'] + [u'botgroup1', ] + >>> project.jobsummary() + {u'finished': 6, + u'has_capacity': True, + u'pending': 0, + u'project': 1111111, + u'running': 0} + +Spider +------ + +To get spider id correlated with its name:: + + >>> project.ids.spider('foo') + 1 + +To see last jobs summaries:: + + >>> summaries = project.spiders.lastjobsummary(count=3) + +To get job summary per spider:: + + >>> summary = project.spiders.lastjobsummary(spiderid='1') + +Job +--- + +Job can be **retrieved** directly by id (project_id/spider_id/job_id):: + + >>> job = hc.get_job('1111111/1/1') + >>> job.key + '1111111/1/1' + >>> job.metadata['state'] + u'finished' + +**Creating** a new job requires a spider name:: + + >>> job = hc.push_job(projectid='1111111', spidername='foo') + >>> job.key + '1111111/1/1' + +Priority can be between 0 and 4 (from lowest to highest), the default is 2. + +To push job from project level with the highest priority:: + + >>> job = project.push_job(spidername='foo', priority=4) + >>> job.metadata['priority'] + 4 + +Pushing a job with spider arguments:: + + >>> project.push_job(spidername='foo', spider_args={'arg1': 'foo', 'arg2': 'bar'}) + +Running job can be **cancelled** by calling ``request_cancel()``:: + + >>> job.request_cancel() + >>> job.metadata['cancelled_by'] + u'John' + +To **delete** job:: + + >>> job.purged() + >>> job.metadata['state'] + u'deleted' + +Job details +----------- + +Job details can be found in jobs metadata and it's scrapystats:: + + >>> job = hc.get_job('1111111/1/1') + >>> job.metadata['version'] + u'5123a86-master' + >>> job.metadata['scrapystats'] + ... + u'downloader/response_count': 104, + u'downloader/response_status_count/200': 104, + u'finish_reason': u'finished', + u'finish_time': 1447160494937, + u'item_scraped_count': 50, + u'log_count/DEBUG': 157, + u'log_count/INFO': 1365, + u'log_count/WARNING': 3, + u'memusage/max': 182988800, + u'memusage/startup': 62439424, + ... 
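Individual counters can be read from that mapping directly, for example (using the values shown above)::

    >>> job.metadata['scrapystats']['item_scraped_count']
    50
    >>> job.metadata['scrapystats']['finish_reason']
    u'finished'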
+ +Anything can be stored in metadata, here is example how to add tags:: + + >>> job.update_metadata({'tags': 'obsolete'}) + +Jobs +---- + +To iterate through all jobs metadata per project (descending order):: + + >>> jobs_metadata = project.jobq.list() + >>> [j['key'] for j in jobs_metadata] + ['1111111/1/3', '1111111/1/2', '1111111/1/1'] + +Jobq metadata fieldset is less detailed, than ``job.metadata``, but contains few new fields as well. +Additional fields can be requested using the ``jobmeta`` parameter. +If it used, then it's up to the user to list all the required fields, so only few default fields would be added except requested ones:: + + >>> metadata = next(project.jobq.list()) + >>> metadata.get('spider', 'missing') + u'foo' + >>> jobs_metadata = project.jobq.list(jobmeta=['scheduled_by', ]) + >>> metadata = next(jobs_metadata) + >>> metadata.get('scheduled_by', 'missing') + u'John' + >>> metadata.get('spider', 'missing') + missing + +By default ``jobq.list()`` returns maximum last 1000 results. Pagination is available using the ``start`` parameter:: + + >>> jobs_metadata = project.jobq.list(start=1000) + +There are several filters like spider, state, has_tag, lacks_tag, startts and endts. +To get jobs filtered by tags:: + + >>> jobs_metadata = project.jobq.list(has_tag=['new', 'verified'], lacks_tag='obsolete') + +List of tags has ``OR`` power, so in the case above jobs with 'new' or 'verified' tag are expected. + +To get certain number of last finished jobs per some spider:: + + >>> jobs_metadata = project.jobq.list(spider='foo', state='finished' count=3) + +There are 4 possible job states, which can be used as values for filtering by state: + +- pending +- running +- finished +- deleted + + +Items +----- + +To iterate through items:: + + >>> items = job.items.iter_values() + >>> for item in items: + # do something, item is just a dict + +Logs +---- + +To iterate through 10 first logs for example:: + + >>> logs = job.logs.iter_values(count=10) + >>> for log in logs: + # do something, log is a dict with log level, message and time keys + +Collections +----------- + +Let's store hash and timestamp pair for foo spider. Usual workflow with `Collections`_ would be:: + + >>> collections = project.collections + >>> foo_store = collections.new_store('foo_store') + >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}) + >>> foo_store.count() + 1 + >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7') + {u'value': u'1447221694537'} + >>> # iterate over _key & value pair + ... list(foo_store.iter_values()) + [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] + >>> # filter by multiple keys - only values for keys that exist will be returned + ... 
list(foo_store.iter_values(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah'])) + [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] + >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') + >>> foo_store.count() + 0 + +Frontier +-------- + +Typical workflow with `Frontier`_:: + + >>> frontier = project.frontier + +Add a request to the frontier:: + + >>> frontier.add('test', 'example.com', [{'fp': '/some/path.html'}]) + >>> frontier.flush() + >>> frontier.newcount + 1 + +Add requests with additional parameters:: + + >>> frontier.add('test', 'example.com', [{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}]) + >>> frontier.flush() + >>> frontier.newcount + 2 + +To delete the slot ``example.com`` from the frontier:: + + >>> frontier.delete_slot('test', 'example.com') + +To retrieve requests for a given slot:: + + >>> reqs = frontier.read('test', 'example.com') + +To delete a batch of requests:: + + >>> frontier.delete('test', 'example.com', '00013967d8af7b0001') + +To retrieve fingerprints for a given slot:: + + >>> fps = [req['requests'] for req in frontier.read('test', 'example.com')] + + +Module contents +--------------- + +.. automodule:: scrapinghub.hubstorage + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/modules.rst b/docs/modules/modules.rst deleted file mode 100644 index bb76eb58..00000000 --- a/docs/modules/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -scrapinghub -=========== - -.. toctree:: - :maxdepth: 4 - - scrapinghub diff --git a/docs/modules/scrapinghub.rst b/docs/modules/scrapinghub.rst deleted file mode 100644 index b8e9fb2e..00000000 --- a/docs/modules/scrapinghub.rst +++ /dev/null @@ -1,17 +0,0 @@ -scrapinghub package -=================== - -Subpackages ------------ - -.. toctree:: - - scrapinghub.client - -Module contents ---------------- - -.. automodule:: scrapinghub - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/overview.rst b/docs/overview.rst new file mode 100644 index 00000000..e5c555a1 --- /dev/null +++ b/docs/overview.rst @@ -0,0 +1,585 @@ +ScrapinghubClient overview +========================== + +The ``scrapinghub.ScrapinghubClient`` is a new Python client for communicating +with the `Scrapinghub API`_. It takes best from ``scrapinghub.Connection`` and +``scrapinghub.HubstorageClient`` and combines it under single interface. + +First, you instantiate new client:: + + >>> from scrapinghub import ScrapinghubClient + >>> client = ScrapinghubClient('APIKEY') + >>> client + + +Client instance has ``projects`` field for access to client projects. + +Projects +-------- + +You can list the projects available to your account:: + + >>> client.projects.list() + [123, 456] + +Or check the projects summary:: + + >>> client.projects.summary() + [{'finished': 674, + 'has_capacity': True, + 'pending': 0, + 'project': 123, + 'running': 1}, + {'finished': 33079, + 'has_capacity': True, + 'pending': 0, + 'project': 456, + 'running': 2}] + +And select a particular project to work with:: + + >>> project = client.get_project(123) + >>> project + + >>> project.key + '123' + +The above is a shortcut for ``client.projects.get(123)``. + +Project +------- + +Project instance has ``jobs`` field to work with the project jobs. + +Jobs instance is described well in ``Jobs`` section below. 
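+
+As a quick sketch (assuming the project already has a few finished jobs), the
+``jobs`` field can be queried directly, e.g. to count jobs by state::
+
+    >>> project.jobs.count(state='finished')
+    5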
+ +For example, to schedule a spider run (it returns a job object):: + + >>> project.jobs.schedule('spider1', job_args={'arg1':'val1'}) + > + +Project instance also has the following fields: + +- activity - access to project activity records +- collections - work with project collections (see ``Collections`` section) +- frontiers - using project frontier (see ``Frontiers`` section) +- settings - interface to project settings +- spiders - access to spiders collection (see ``Spiders`` section) + + +Settings +-------- + +To get a list of the project settings:: + + >>> project.settings.list() + [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] + +To get a project setting value by name:: + + >>> project.settings.get('job_runtime_limit') + 24 + +To update a project setting value by name:: + + >>> project.settings.set('job_runtime_limit', 20) + +Or update a few project settings at once:: + + >>> project.settings.update({'default_job_units': 1, + ... 'job_runtime_limit': 20}) + + +Spiders +------- + +To get the list of spiders of the project:: + + >>> project.spiders.list() + [ + {'id': 'spider1', 'tags': [], 'type': 'manual', 'version': '123'}, + {'id': 'spider2', 'tags': [], 'type': 'manual', 'version': '123'} + ] + +To select a particular spider to work with:: + + >>> spider = project.spiders.get('spider2') + >>> spider + + >>> spider.key + '123/2' + >>> spider.name + spider2 + +Spider +------ + +Like project instance, spider instance has ``jobs`` field to work with the spider's jobs. + +To schedule a spider run:: + + >>> spider.jobs.schedule(job_args={'arg1:'val1'}) + > + +Note that you don't need to specify spider name explicitly. + +Jobs +---- + +Jobs collection is available on project/spider level. + +get +^^^ + +To select a specific job for a project:: + + >>> job = project.jobs.get('123/1/2') + >>> job.key + '123/1/2' + +Also there's a shortcut to get same job with client instance:: + + >>> job = client.get_job('123/1/2') + +schedule +^^^^^^^^ + +Use ``schedule`` method to schedule a new job for project/spider:: + + >>> job = spider.jobs.schedule() + +Scheduling logic supports different options, like + +- spider_args to provide spider arguments for the job +- units to specify amount of units to schedule the job +- job_settings to pass additional settings for the job +- priority to set higher/lower priority of the job +- add_tag to create a job with a set of initial tags +- meta to pass additional custom metadata + +For example, to schedule a new job for a given spider with custom params:: + + >>> job = spider.jobs.schedule(units=2, job_settings={'SETTING': 'VALUE'}, + priority=1, add_tag=['tagA','tagB'], meta={'custom-data': 'val1'}) + +Note that if you schedule a job on project level, spider name is required:: + + >>> job = project.jobs.schedule('spider1') + +count +^^^^^ + +It's also possible to count jobs for a given project/spider:: + + >>> spider.jobs.count() + 5 + +Count logic supports different filters, as described for `count endpoint`_. + + +iter +^^^^ + +To iterate through the spider jobs (descending order):: + + >>> jobs_summary = spider.jobs.iter() + >>> [j['key'] for j in jobs_summary] + ['123/1/3', '123/1/2', '123/1/1'] + +``jobs_summary`` is an iterator and, when iterated, returns an iterable +of dict objects, so you typically use it like this:: + + >>> for job in jobs_summary: + ... 
# do something with job data + +Or, if you just want to get the job ids:: + + >>> [x['key'] for x in jobs_summary] + ['123/1/3', '123/1/2', '123/1/1'] + +Job summary fieldset from ``iter()`` is less detailed than ``job.metadata``, +but contains few new fields as well. Additional fields can be requested using +the ``jobmeta`` parameter. If it used, then it's up to the user to list all the +required fields, so only few default fields would be added except requested +ones:: + + >>> job_summary = next(project.jobs.iter()) + >>> job_summary.get('spider', 'missing') + 'foo' + >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by', ]) + >>> job_summary = next(jobs_summary) + >>> job_summary.get('scheduled_by', 'missing') + 'John' + >>> job_summary.get('spider', 'missing') + missing + +By default ``jobs.iter()`` returns maximum last 1000 results. +Pagination is available using the ``start`` parameter:: + + >>> jobs_summary = spider.jobs.iter(start=1000) + +There are several filters like spider, state, has_tag, lacks_tag, +startts and endts (check `list endpoint`_ for more details). + +To get jobs filtered by tags:: + + >>> jobs_summary = project.jobs.iter(has_tag=['new', 'verified'], lacks_tag='obsolete') + +List of tags has ``OR`` power, so in the case above jobs with 'new' or +'verified' tag are expected. + +To get certain number of last finished jobs per some spider:: + + >>> jobs_summary = project.jobs.iter(spider='foo', state='finished', count=3) + +There are 4 possible job states, which can be used as values +for filtering by state: + +- pending +- running +- finished +- deleted + +Dict entries returned by ``iter`` method contain some additional meta, +but can be easily converted to ``Job`` instances with:: + + >>> [Job(x['key']) for x in jobs] + [ + , + , + , + ] + +summary +^^^^^^^ + +To check jobs summary:: + + >>> spider.jobs.summary() + [{'count': 0, 'name': 'pending', 'summary': []}, + {'count': 0, 'name': 'running', 'summary': []}, + {'count': 5, + 'name': 'finished', + 'summary': [...]} + +It's also possible to get last jobs summary (for each spider):: + + >>> list(sp.jobs.iter_last()) + [{'close_reason': 'success', + 'elapsed': 3062444, + 'errors': 1, + 'finished_time': 1482911633089, + 'key': '123/1/3', + 'logs': 8, + 'pending_time': 1482911596566, + 'running_time': 1482911598909, + 'spider': 'spider1', + 'state': 'finished', + 'ts': 1482911615830, + 'version': 'some-version'}] + +Note that there can be a lot of spiders, so the method above returns an iterator. + +Job +--- + +Job instance provides access to a job data with the following fields: + +- metadata +- items +- logs +- requests +- samples + +Request to cancel a job:: + + >>> job.cancel() + +To delete a job:: + + >>> job.delete() + +Metadata +^^^^^^^^ + +Job details can be found in jobs metadata and it's scrapystats:: + + >>> job.metadata.get('version') + '5123a86-master' + >>> job.metadata.get('scrapystats') + ... + 'downloader/response_count': 104, + 'downloader/response_status_count/200': 104, + 'finish_reason': 'finished', + 'finish_time': 1447160494937, + 'item_scraped_count': 50, + 'log_count/DEBUG': 157, + 'log_count/INFO': 1365, + 'log_count/WARNING': 3, + 'memusage/max': 182988800, + 'memusage/startup': 62439424, + ... + +Anything can be stored in metadata, here is example how to add tags:: + + >>> job.metadata.set('tags', ['obsolete']) + +Items +^^^^^ + +To retrieve all scraped items from a job:: + + >>> for item in job.items.iter(): + ... 
# do something with item (it's just a dict) + +Logs +^^^^ + +To retrieve all log entries from a job:: + + >>> for logitem in job.logs.iter(): + ... # logitem is a dict with level, message, time + >>> logitem + { + 'level': 20, + 'message': '[scrapy.core.engine] Closing spider (finished)', + 'time': 1482233733976}, + } + +Requests +^^^^^^^^ + +To retrieve all requests from a job:: + + >>> for reqitem in job.requests.iter(): + ... # reqitem is a dict + >>> reqitem + [{ + 'duration': 354, + 'fp': '6d748741a927b10454c83ac285b002cd239964ea', + 'method': 'GET', + 'rs': 1270, + 'status': 200, + 'time': 1482233733870, + 'url': 'https://example.com' + }] + +Samples +^^^^^^^ + +To retrieve all samples for a job:: + + >>> for sample in job.samples.iter(): + ... # sample is a list with a timestamp and data + >>> sample + [1482233732452, 0, 0, 0, 0, 0] + + +Activity +-------- + +To retrieve all activity events from a project:: + + >>> project.activity.iter() + + + >>> project.activity.list() + [{'event': 'job:completed', 'job': '123/2/3', 'user': 'jobrunner'}, + {'event': 'job:cancelled', 'job': '123/2/3', 'user': 'john'}] + +To post a new activity event:: + + >>> event = {'event': 'job:completed', 'job': '123/2/4', 'user': 'john'} + >>> project.activity.add(event) + +Or post multiple events at once:: + + >>> events = [ + {'event': 'job:completed', 'job': '123/2/5', 'user': 'john'}, + {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, + ] + >>> project.activity.add(events) + + +Collections +----------- + +As an example, let's store hash and timestamp pair for foo spider. + +Usual workflow with `Collections`_ would be:: + + >>> collections = project.collections + >>> foo_store = collections.get_store('foo_store') + >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}) + >>> foo_store.count() + 1 + >>> foo_store.get('002d050ee3ff6192dcbecc4e4b4457d7') + {u'value': u'1447221694537'} + >>> # iterate over _key & value pair + ... list(foo_store.iter()) + [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] + >>> # filter by multiple keys - only values for keys that exist will be returned + ... list(foo_store.iter(key=['002d050ee3ff6192dcbecc4e4b4457d7', 'blah'])) + [{u'_key': u'002d050ee3ff6192dcbecc4e4b4457d7', u'value': u'1447221694537'}] + >>> foo_store.delete('002d050ee3ff6192dcbecc4e4b4457d7') + >>> foo_store.count() + 0 + +Collections are available on project level only. + +Frontiers +--------- + +Typical workflow with `Frontier`_:: + + >>> frontiers = project.frontiers + +Get all frontiers from a project to iterate through it:: + + >>> frontiers.iter() + + +List all frontiers:: + + >>> frontiers.list() + ['test', 'test1', 'test2'] + +Get a frontier by name:: + + >>> frontier = frontiers.get('test') + >>> frontier + + +Get an iterator to iterate through a frontier slots:: + + >>> frontier.iter() + + +List all slots:: + + >>> frontier.list() + ['example.com', 'example.com2'] + +Get a frontier slot by name:: + + >>> slot = frontier.get('example.com') + >>> slot + + +Add a request to the slot:: + + >>> slot.queue.add([{'fp': '/some/path.html'}]) + >>> slot.flush() + >>> slot.newcount + 1 + +``newcount`` is defined per slot, but also available per frontier and globally:: + + >>> frontier.newcount + 1 + >>> frontiers.newcount + 3 + +Add a fingerprint only to the slot:: + + >>> slot.fingerprints.add(['fp1', 'fp2']) + >>> slot.flush() + +There are convenient shortcuts: ``f`` for ``fingerprints`` and ``q`` for ``queue``. 
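+
+For instance, the fingerprints call above could just as well be written with
+the short form::
+
+    >>> slot.f.add(['fp1', 'fp2'])
+    >>> slot.flush()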
+ +Add requests with additional parameters:: + + >>> slot.q.add([{'fp': '/'}, {'fp': 'page1.html', 'p': 1, 'qdata': {'depth': 1}}]) + >>> slot.flush() + +To retrieve all requests for a given slot:: + + >>> reqs = slot.q.iter() + +To retrieve all fingerprints for a given slot:: + + >>> fps = slot.f.iter() + +To list all the requests use ``list()`` method (similar for ``fingerprints``):: + + >>> fps = slot.q.list() + +To delete a batch of requests:: + + >>> slot.q.delete('00013967d8af7b0001') + +To delete the whole slot from the frontier:: + + >>> slot.delete() + +Flush data of the given frontier:: + + >>> frontier.flush() + +Flush data of all frontiers of a project:: + + >>> frontiers.flush() + +Close batch writers of all frontiers of a project:: + + >>> frontiers.close() + +Frontiers are available on project level only. + +Tags +---- + +Tags is a convenient way to mark specific jobs (for better search, postprocessing etc). + +To mark a job with tag ``consumed``:: + + >>> job.update_tags(add=['consumed']) + +To mark all spider jobs with tag ``consumed``:: + + >>> spider.jobs.update_tags(add=['consumed']) + +To remove existing tag ``existing`` for all spider jobs:: + + >>> spider.jobs.update_tags(remove=['existing']) + +Modifying tags is available on spider/job levels. + + +Exceptions +---------- + +scrapinghub.exceptions.ScrapinghubAPIError +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Base exception class. + + +scrapinghub.exceptions.InvalidUsage +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Usually raised in case of 400 response from API. + + +scrapinghub.exceptions.NotFound +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Entity doesn't exist (e.g. spider or project). + + +scrapinghub.exceptions.ValueTooLarge +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Value cannot be writtent because it exceeds size limits. + +scrapinghub.exceptions.DuplicateJobError +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Job for given spider with given arguments is already scheduled or running. + + +.. _Scrapinghub API: http://doc.scrapinghub.com/api.html +.. _Collections: http://doc.scrapinghub.com/api/collections.html +.. _Frontier: http://doc.scrapinghub.com/api/frontier.html +.. _count endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-count +.. _list endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-list diff --git a/docs/testing.rst b/docs/testing.rst new file mode 100644 index 00000000..5c7fdb3b --- /dev/null +++ b/docs/testing.rst @@ -0,0 +1,28 @@ +Integration tests +================= + +The package is covered with integration tests based on `VCR.py library`_: there +are recorded cassettes files in ``tests/*/cassettes`` used instead of HTTP +requests to real services, it helps to simplify and speed up development. + +By default, tests use VCR.py ``once`` mode to: + +- replay previously recorded interactions. +- record new interactions if there is no cassette file. +- cause an error to be raised for new requests if there is a cassette file. + +It means that if you add new integration tests and run all tests as usual, +only new cassettes will be created, all existing cassettes will stay unmodified. + +To ignore existing cassettes and use real service, please provide a flag:: + + py.test --ignore-cassettes + +If you want to update/recreate all the cassettes from scratch, please use:: + + py.test --update-cassettes + +Note that internally the above command erases the whole folder with cassettes. + + +.. 
_VCR.py library: https://pypi.python.org/pypi/vcrpy From d063317b0b75eb1a42399303cb986ac071bca23e Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 17:54:03 +0300 Subject: [PATCH 06/40] Fix readme, describe exceptions changes --- README.rst | 4 ++-- docs/index.rst | 5 ----- docs/overview.rst | 17 +++++++++++++++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 988d7489..66c9868b 100644 --- a/README.rst +++ b/README.rst @@ -26,8 +26,8 @@ response time and improved bandwidth usage:: Documentation ------------- -Documentation is available online via Read the Docs: -https://python-scrapinghub.readthedocs.io/, or in the ``docs`` directory. +Documentation is available [online](https://python-scrapinghub.readthedocs.io/) via Read the Docs, +or in the ``docs`` directory. .. _Scrapinghub API: http://doc.scrapinghub.com/api.html diff --git a/docs/index.rst b/docs/index.rst index 0d43899d..3a7f9fb3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,8 +1,3 @@ -.. python-scrapinghub documentation master file, created by - sphinx-quickstart on Fri Mar 24 12:28:40 2017. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - ==================================== Client interface for Scrapinghub API ==================================== diff --git a/docs/overview.rst b/docs/overview.rst index e5c555a1..5c6c0128 100644 --- a/docs/overview.rst +++ b/docs/overview.rst @@ -555,12 +555,18 @@ scrapinghub.exceptions.ScrapinghubAPIError Base exception class. -scrapinghub.exceptions.InvalidUsage -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +scrapinghub.exceptions.BadRequest +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Usually raised in case of 400 response from API. +scrapinghub.exceptions.Unauthorized +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Not enough access to some resources. + + scrapinghub.exceptions.NotFound ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -572,12 +578,19 @@ scrapinghub.exceptions.ValueTooLarge Value cannot be writtent because it exceeds size limits. + scrapinghub.exceptions.DuplicateJobError ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Job for given spider with given arguments is already scheduled or running. +scrapinghub.exceptions.ServerError +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Indicates some server error: something unexpected has happened. + + .. _Scrapinghub API: http://doc.scrapinghub.com/api.html .. _Collections: http://doc.scrapinghub.com/api/collections.html .. _Frontier: http://doc.scrapinghub.com/api/frontier.html From 296df7a6b8c8762a03c0dddbb08ea0707c7f3663 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 17:55:50 +0300 Subject: [PATCH 07/40] Fix link in main readme --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 66c9868b..8b71ab15 100644 --- a/README.rst +++ b/README.rst @@ -26,8 +26,8 @@ response time and improved bandwidth usage:: Documentation ------------- -Documentation is available [online](https://python-scrapinghub.readthedocs.io/) via Read the Docs, -or in the ``docs`` directory. +Documentation is `available online`_ via Read the Docs or in the ``docs`` directory. .. _Scrapinghub API: http://doc.scrapinghub.com/api.html +.. 
_available online: https://python-scrapinghub.readthedocs.io/ From 40c4d39b300d9f85f946a11dee5f6c2272605803 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 18:34:10 +0300 Subject: [PATCH 08/40] Try to make docs more readable --- .../apidocs.rst} | 52 +++++++++---------- docs/{ => client}/overview.rst | 4 +- docs/index.rst | 40 +++++++++----- .../connection.rst} | 2 + .../hubstorage.rst} | 22 ++++---- docs/testing.rst | 28 ---------- requirements-docs.txt | 2 + scrapinghub/client/collections.py | 3 +- 8 files changed, 72 insertions(+), 81 deletions(-) rename docs/{client_apidocs.rst => client/apidocs.rst} (61%) rename docs/{ => client}/overview.rst (99%) rename docs/{legacy_connection.rst => legacy/connection.rst} (99%) rename docs/{legacy_hubstorage.rst => legacy/hubstorage.rst} (98%) delete mode 100644 docs/testing.rst create mode 100644 requirements-docs.txt diff --git a/docs/client_apidocs.rst b/docs/client/apidocs.rst similarity index 61% rename from docs/client_apidocs.rst rename to docs/client/apidocs.rst index 9ce0c8de..62accf98 100644 --- a/docs/client_apidocs.rst +++ b/docs/client/apidocs.rst @@ -1,5 +1,5 @@ -scrapinghub.client package -========================== +Code-based documentation +======================== Module contents --------------- @@ -13,96 +13,96 @@ Module contents Submodules ---------- -scrapinghub.client.activity module ----------------------------------- +activity +-------- .. automodule:: scrapinghub.client.activity :members: :undoc-members: :show-inheritance: -scrapinghub.client.collections module -------------------------------------- +collections +----------- .. automodule:: scrapinghub.client.collections :members: :undoc-members: :show-inheritance: -scrapinghub.client.exceptions module ------------------------------------- +exceptions +---------- .. automodule:: scrapinghub.client.exceptions :members: :undoc-members: :show-inheritance: -scrapinghub.client.frontiers module ------------------------------------ +frontiers +--------- .. automodule:: scrapinghub.client.frontiers :members: :undoc-members: :show-inheritance: -scrapinghub.client.items module -------------------------------- +items +----- .. automodule:: scrapinghub.client.items :members: :undoc-members: :show-inheritance: -scrapinghub.client.jobs module ------------------------------- +jobs +---- .. automodule:: scrapinghub.client.jobs :members: :undoc-members: :show-inheritance: -scrapinghub.client.logs module ------------------------------- +logs +---- .. automodule:: scrapinghub.client.logs :members: :undoc-members: :show-inheritance: -scrapinghub.client.projects module ----------------------------------- +projects +-------- .. automodule:: scrapinghub.client.projects :members: :undoc-members: :show-inheritance: -scrapinghub.client.requests module ----------------------------------- +requests +-------- .. automodule:: scrapinghub.client.requests :members: :undoc-members: :show-inheritance: -scrapinghub.client.samples module ---------------------------------- +samples +------- .. automodule:: scrapinghub.client.samples :members: :undoc-members: :show-inheritance: -scrapinghub.client.spiders module ---------------------------------- +spiders +------- .. automodule:: scrapinghub.client.spiders :members: :undoc-members: :show-inheritance: -scrapinghub.client.utils module -------------------------------- +utils +----- .. 
automodule:: scrapinghub.client.utils :members: diff --git a/docs/overview.rst b/docs/client/overview.rst similarity index 99% rename from docs/overview.rst rename to docs/client/overview.rst index 5c6c0128..40361636 100644 --- a/docs/overview.rst +++ b/docs/client/overview.rst @@ -1,5 +1,5 @@ -ScrapinghubClient overview -========================== +Overview +======== The ``scrapinghub.ScrapinghubClient`` is a new Python client for communicating with the `Scrapinghub API`_. It takes best from ``scrapinghub.Connection`` and diff --git a/docs/index.rst b/docs/index.rst index 3a7f9fb3..48f3e9eb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -33,11 +33,10 @@ ScrapinghubClient ================= .. toctree:: - :maxdepth: 2 - - overview - client_apidocs + :maxdepth: 1 + client/overview + client/apidocs Legacy clients ============== @@ -45,22 +44,35 @@ Legacy clients .. toctree:: :maxdepth: 2 - legacy_connection - legacy_hubstorage + legacy/connection + legacy/hubstorage Tests ===== -.. toctree:: - :maxdepth: 2 +The package is covered with integration tests based on `VCR.py library`_: there +are recorded cassettes files in ``tests/*/cassettes`` used instead of HTTP +requests to real services, it helps to simplify and speed up development. + +By default, tests use VCR.py ``once`` mode to: + +- replay previously recorded interactions. +- record new interactions if there is no cassette file. +- cause an error to be raised for new requests if there is a cassette file. + +It means that if you add new integration tests and run all tests as usual, +only new cassettes will be created, all existing cassettes will stay unmodified. + +To ignore existing cassettes and use real service, please provide a flag:: + + py.test --ignore-cassettes + +If you want to update/recreate all the cassettes from scratch, please use:: - testing + py.test --update-cassettes +Note that internally the above command erases the whole folder with cassettes. -Indices and tables -================== -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` +.. _VCR.py library: https://pypi.python.org/pypi/vcrpy diff --git a/docs/legacy_connection.rst b/docs/legacy/connection.rst similarity index 99% rename from docs/legacy_connection.rst rename to docs/legacy/connection.rst index e46a6a7c..891c8eef 100644 --- a/docs/legacy_connection.rst +++ b/docs/legacy/connection.rst @@ -3,6 +3,8 @@ scrapinghub.legacy.Connection TODO add short description & deprecation message here! +Overview +-------- First, you connect to Scrapinghub:: diff --git a/docs/legacy_hubstorage.rst b/docs/legacy/hubstorage.rst similarity index 98% rename from docs/legacy_hubstorage.rst rename to docs/legacy/hubstorage.rst index 3a3fe028..6b5df061 100644 --- a/docs/legacy_hubstorage.rst +++ b/docs/legacy/hubstorage.rst @@ -6,6 +6,10 @@ TODO add short description & deprecation message here! The library can also be used for interaction with spiders, jobs and scraped data through ``storage.scrapinghub.com`` endpoints. 
+ +Overview +-------- + First, use your API key for authorization:: >>> from scrapinghub import HubstorageClient @@ -14,7 +18,7 @@ First, use your API key for authorization:: 1446222762611 Project -------- +^^^^^^^ To get project settings or jobs summary:: @@ -29,7 +33,7 @@ To get project settings or jobs summary:: u'running': 0} Spider ------- +^^^^^^ To get spider id correlated with its name:: @@ -45,7 +49,7 @@ To get job summary per spider:: >>> summary = project.spiders.lastjobsummary(spiderid='1') Job ---- +^^^ Job can be **retrieved** directly by id (project_id/spider_id/job_id):: @@ -86,7 +90,7 @@ To **delete** job:: u'deleted' Job details ------------ +^^^^^^^^^^^ Job details can be found in jobs metadata and it's scrapystats:: @@ -112,7 +116,7 @@ Anything can be stored in metadata, here is example how to add tags:: >>> job.update_metadata({'tags': 'obsolete'}) Jobs ----- +^^^^ To iterate through all jobs metadata per project (descending order):: @@ -158,7 +162,7 @@ There are 4 possible job states, which can be used as values for filtering by st Items ------ +^^^^^ To iterate through items:: @@ -167,7 +171,7 @@ To iterate through items:: # do something, item is just a dict Logs ----- +^^^^ To iterate through 10 first logs for example:: @@ -176,7 +180,7 @@ To iterate through 10 first logs for example:: # do something, log is a dict with log level, message and time keys Collections ------------ +^^^^^^^^^^^ Let's store hash and timestamp pair for foo spider. Usual workflow with `Collections`_ would be:: @@ -198,7 +202,7 @@ Let's store hash and timestamp pair for foo spider. Usual workflow with `Collect 0 Frontier --------- +^^^^^^^^ Typical workflow with `Frontier`_:: diff --git a/docs/testing.rst b/docs/testing.rst deleted file mode 100644 index 5c7fdb3b..00000000 --- a/docs/testing.rst +++ /dev/null @@ -1,28 +0,0 @@ -Integration tests -================= - -The package is covered with integration tests based on `VCR.py library`_: there -are recorded cassettes files in ``tests/*/cassettes`` used instead of HTTP -requests to real services, it helps to simplify and speed up development. - -By default, tests use VCR.py ``once`` mode to: - -- replay previously recorded interactions. -- record new interactions if there is no cassette file. -- cause an error to be raised for new requests if there is a cassette file. - -It means that if you add new integration tests and run all tests as usual, -only new cassettes will be created, all existing cassettes will stay unmodified. - -To ignore existing cassettes and use real service, please provide a flag:: - - py.test --ignore-cassettes - -If you want to update/recreate all the cassettes from scratch, please use:: - - py.test --update-cassettes - -Note that internally the above command erases the whole folder with cassettes. - - -.. _VCR.py library: https://pypi.python.org/pypi/vcrpy diff --git a/requirements-docs.txt b/requirements-docs.txt new file mode 100644 index 00000000..b18e9df9 --- /dev/null +++ b/requirements-docs.txt @@ -0,0 +1,2 @@ +sphinx==1.5.3 +sphinx_rtd_theme==0.2.4 diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index ddb37507..09f8ac2b 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -172,10 +172,9 @@ def list(self, key=None, prefix=None, prefixcount=None, startts=None, :param \*\*params: (optional) additional query params for the request. :return: a list of items where each item is represented with a dict. 
:rtype: list of dicts - + """ # FIXME there should be similar docstrings for iter/iter_raw_json # but as we proxy them as-is, it's not in place, should be improved - """ update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) From 525754efbbc21de32f21a03b99f7b3718c291ffe Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 18:52:22 +0300 Subject: [PATCH 09/40] Add basic usage to quickstart, refactor docs --- docs/index.rst | 64 +------------------------------- docs/legacy/clients.rst | 8 ++++ docs/quickstart.rst | 82 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 62 deletions(-) create mode 100644 docs/legacy/clients.rst create mode 100644 docs/quickstart.rst diff --git a/docs/index.rst b/docs/index.rst index 48f3e9eb..ea9f6021 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,72 +7,12 @@ Client interface for Scrapinghub API The ``scrapinghub`` is a Python library for communicating with the `Scrapinghub API`_. - .. _Scrapinghub API: http://doc.scrapinghub.com/api.html -Requirements -============ - -* Python 2.7 or above - - -Installation -============ - -The quick way:: - - pip install scrapinghub - -You can also install the library with MessagePack support, it provides better -response time and improved bandwidth usage:: - - pip install scrapinghub[msgpack] - - -ScrapinghubClient -================= - .. toctree:: :maxdepth: 1 + quickstart client/overview client/apidocs - -Legacy clients -============== - -.. toctree:: - :maxdepth: 2 - - legacy/connection - legacy/hubstorage - - -Tests -===== - -The package is covered with integration tests based on `VCR.py library`_: there -are recorded cassettes files in ``tests/*/cassettes`` used instead of HTTP -requests to real services, it helps to simplify and speed up development. - -By default, tests use VCR.py ``once`` mode to: - -- replay previously recorded interactions. -- record new interactions if there is no cassette file. -- cause an error to be raised for new requests if there is a cassette file. - -It means that if you add new integration tests and run all tests as usual, -only new cassettes will be created, all existing cassettes will stay unmodified. - -To ignore existing cassettes and use real service, please provide a flag:: - - py.test --ignore-cassettes - -If you want to update/recreate all the cassettes from scratch, please use:: - - py.test --update-cassettes - -Note that internally the above command erases the whole folder with cassettes. - - -.. _VCR.py library: https://pypi.python.org/pypi/vcrpy + legacy/clients diff --git a/docs/legacy/clients.rst b/docs/legacy/clients.rst new file mode 100644 index 00000000..f5194d2f --- /dev/null +++ b/docs/legacy/clients.rst @@ -0,0 +1,8 @@ +Legacy clients +============== + +.. 
toctree:: + :maxdepth: 2 + + connection + hubstorage diff --git a/docs/quickstart.rst b/docs/quickstart.rst new file mode 100644 index 00000000..e8c0adfd --- /dev/null +++ b/docs/quickstart.rst @@ -0,0 +1,82 @@ +Quickstart +========== + +Requirements +------------ + +* Python 2.7 or above + + +Installation +------------ + +The quick way:: + + pip install scrapinghub + +You can also install the library with MessagePack support, it provides better +response time and improved bandwidth usage:: + + pip install scrapinghub[msgpack] + + +Basic usage +----------- + +Instantiate new client:: + + >>> from scrapinghub import ScrapinghubClient + >>> client = ScrapinghubClient('APIKEY') + +Work with your projects:: + + >>> client.projects.list() + [123, 456] + +Run new jobs from the client:: + + >>> project.jobs.run('spider1', job_args={'arg1':'val1'}) + > + +Access job data:: + + >>> job = project.job(u'123/1/2') + >>> for item in job.items(): + ... print(item) + { + 'name': ['Some other item'], + 'url': 'http://some-url/other-item.html', + 'size': 35000, + } + +Many more feature are awaiting for you. + + +Tests +----- + +The package is covered with integration tests based on `VCR.py library`_: there +are recorded cassettes files in ``tests/*/cassettes`` used instead of HTTP +requests to real services, it helps to simplify and speed up development. + +By default, tests use VCR.py ``once`` mode to: + +- replay previously recorded interactions. +- record new interactions if there is no cassette file. +- cause an error to be raised for new requests if there is a cassette file. + +It means that if you add new integration tests and run all tests as usual, +only new cassettes will be created, all existing cassettes will stay unmodified. + +To ignore existing cassettes and use real service, please provide a flag:: + + py.test --ignore-cassettes + +If you want to update/recreate all the cassettes from scratch, please use:: + + py.test --update-cassettes + +Note that internally the above command erases the whole folder with cassettes. + + +.. _VCR.py library: https://pypi.python.org/pypi/vcrpy From 388efbab1f8ae9c54b28b47a898769293c294c30 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 19:05:03 +0300 Subject: [PATCH 10/40] Minor fixes for some docstrings --- docs/conf.py | 3 ++- docs/quickstart.rst | 5 +++-- scrapinghub/client/jobs.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 432901b3..4b9d6064 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -95,7 +95,8 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] +html_static_path = [] # -- Options for HTMLHelp output ------------------------------------------ diff --git a/docs/quickstart.rst b/docs/quickstart.rst index e8c0adfd..911cf7ee 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -35,12 +35,13 @@ Work with your projects:: Run new jobs from the client:: + >>> project = client.get_project(123) >>> project.jobs.run('spider1', job_args={'arg1':'val1'}) > -Access job data:: +Access your jobs data:: - >>> job = project.job(u'123/1/2') + >>> job = client.get_job('123/1/2') >>> for item in job.items(): ... 
print(item) { diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index 62bc53f6..e9c21e49 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -385,7 +385,7 @@ class Job(object): Usage:: - >>> job = project.job('123/1/2') + >>> job = project.jobs.get('123/1/2') >>> job.key '123/1/2' >>> job.metadata.get('state') From 495064884fff6227b32130cf5f70372d5f5761ed Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Fri, 24 Mar 2017 19:09:12 +0300 Subject: [PATCH 11/40] Install main requirements for docs --- requirements-docs.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-docs.txt b/requirements-docs.txt index b18e9df9..8284e0b7 100644 --- a/requirements-docs.txt +++ b/requirements-docs.txt @@ -1,2 +1,3 @@ +-r requirements.txt sphinx==1.5.3 sphinx_rtd_theme==0.2.4 From 30d46bae39516c9038c35eab9fd5012f4ac92566 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Mon, 27 Mar 2017 11:12:51 +0300 Subject: [PATCH 12/40] Fix copyright, use typing notation for lists --- docs/conf.py | 11 +++++++++-- scrapinghub/client/collections.py | 4 ++-- scrapinghub/client/frontiers.py | 6 +++--- scrapinghub/client/jobs.py | 4 ++-- scrapinghub/client/projects.py | 4 ++-- scrapinghub/client/spiders.py | 4 ++-- 6 files changed, 20 insertions(+), 13 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 4b9d6064..30c7e1d8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,9 +18,16 @@ # import os import sys +from datetime import datetime + + sys.path.insert(0, os.path.abspath('..')) + from scrapinghub import __version__ # noqa + + +YEAR = datetime.now().year VERSION = __version__.rsplit('.', 2)[0] # -- General configuration ------------------------------------------------ @@ -48,8 +55,8 @@ # General information about the project. project = u'scrapinghub' -copyright = u'2017, Pablo Hoffman, Daniel Graña' -author = u'Pablo Hoffman, Daniel Graña' +copyright = u'2010-{}, Scrapinghub'.format(YEAR) +author = u'Scrapinghub' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index 09f8ac2b..e3f19139 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -91,7 +91,7 @@ def list(self): :return: a list of collections where each collection is represented by a dictionary with ('name','type') fields. - :rtype: list of dicts + :rtype: List[dict] """ return list(self.iter()) @@ -171,7 +171,7 @@ def list(self, key=None, prefix=None, prefixcount=None, startts=None, :param requests_params: (optional) a dict with optional requests params. :param \*\*params: (optional) additional query params for the request. :return: a list of items where each item is represented with a dict. - :rtype: list of dicts + :rtype: List[dict] """ # FIXME there should be similar docstrings for iter/iter_raw_json # but as we proxy them as-is, it's not in place, should be improved diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index dedad31a..62a81685 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -171,7 +171,7 @@ def list(self): """List all slots. :return: a list of frontier slots names. 
- :rtype: list of strings + :rtype: List[str] """ return next(self._frontiers._origin.apiget((self.key, 'list'))) @@ -315,7 +315,7 @@ def list(self, **params): :param \*\*params: (optional) additional query params for the request. :return: a list of fingerprints. - :rtype: list of strings + :rtype: List[str] """ return list(self.iter(**params)) @@ -353,7 +353,7 @@ def list(self, mincount=None, **params): :param \*\*params: (optional) additional query params for the request. :return: a list of request batches in the queue where each batch is represented with a dict with ('id', 'requests') field. - :rtype: list of dicts + :rtype: List[dict] """ return list(self.iter(mincount=mincount, **params)) diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index e9c21e49..a788d1c2 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -164,7 +164,7 @@ def list(self, count=None, start=None, spider=None, state=None, :param \*\*params: (optional) other filter params. :return: list of dictionaries of jobs summary for a given filter params - :rtype: list of dicts + :rtype: List[dict] Please note that list() method can use a lot of memory and for a large amount of jobs it's recommended to iterate through it via iter() @@ -260,7 +260,7 @@ def summary(self, state=None, spider=None, **params): :param \*\*params: (optional) additional keyword args. :return: a list of dictionaries of jobs summary for a given filter params grouped by job state. - :rtype: list of dicts + :rtype: List[dict] Usage:: diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py index 5e9e9bec..eff3c6aa 100644 --- a/scrapinghub/client/projects.py +++ b/scrapinghub/client/projects.py @@ -46,7 +46,7 @@ def list(self): """Get list of projects available to current user. :return: a list of project ids. - :rtype: list of integers + :rtype: List[int] Usage:: @@ -72,7 +72,7 @@ def summary(self, state=None, **params): :return: a list of dictionaries: each dictionary represents a project summary (amount of pending/running/finished jobs and a flag if it has a capacity to run new jobs). - :rtype: list of dicts + :rtype: List[dict] Usage:: diff --git a/scrapinghub/client/spiders.py b/scrapinghub/client/spiders.py index 9bfe302e..8c7c6d79 100644 --- a/scrapinghub/client/spiders.py +++ b/scrapinghub/client/spiders.py @@ -52,7 +52,7 @@ def list(self): """Get a list of spiders for a project. :return: a list of dictionaries with spiders metadata. - :rtype: list of dicts + :rtype: List[dict] Usage:: @@ -122,7 +122,7 @@ def list_tags(self): """List spider tags. :return: a list of spider tags. - :rtype: list of strings + :rtype: List[str] """ path = 'v2/projects/{}/spiders/{}'.format(self.project_id, self._id) url = urljoin(self._client._connection.url, path) From 229e0f0c87b8a76cd8e3a646ad80666721156b91 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Mon, 27 Mar 2017 11:22:47 +0300 Subject: [PATCH 13/40] Update schedule with run in README, fix title --- README.rst | 8 ++++++-- docs/client/overview.rst | 24 ++++++++++++------------ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/README.rst b/README.rst index 8b71ab15..57386e53 100644 --- a/README.rst +++ b/README.rst @@ -1,5 +1,9 @@ -Scrapinghub command line client -=============================== +==================================== +Client interface for Scrapinghub API +==================================== + +.. 
image:: https://secure.travis-ci.org/scrapinghub/python-scrapinghub.png?branch=master + :target: http://travis-ci.org/scrapinghub/python-scrapinghub The ``scrapinghub`` is a Python library for communicating with the `Scrapinghub API`_. diff --git a/docs/client/overview.rst b/docs/client/overview.rst index 40361636..2fb5f967 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -55,7 +55,7 @@ Jobs instance is described well in ``Jobs`` section below. For example, to schedule a spider run (it returns a job object):: - >>> project.jobs.schedule('spider1', job_args={'arg1':'val1'}) + >>> project.jobs.run('spider1', job_args={'arg1':'val1'}) > Project instance also has the following fields: @@ -118,7 +118,7 @@ Like project instance, spider instance has ``jobs`` field to work with the spide To schedule a spider run:: - >>> spider.jobs.schedule(job_args={'arg1:'val1'}) + >>> spider.jobs.run(job_args={'arg1:'val1'}) > Note that you don't need to specify spider name explicitly. @@ -141,30 +141,30 @@ Also there's a shortcut to get same job with client instance:: >>> job = client.get_job('123/1/2') -schedule -^^^^^^^^ +run +^^^ -Use ``schedule`` method to schedule a new job for project/spider:: +Use ``run`` method to run a new job for project/spider:: - >>> job = spider.jobs.schedule() + >>> job = spider.jobs.run() Scheduling logic supports different options, like -- spider_args to provide spider arguments for the job -- units to specify amount of units to schedule the job +- job_args to provide spider arguments for the job +- units to specify amount of units to run the job - job_settings to pass additional settings for the job - priority to set higher/lower priority of the job - add_tag to create a job with a set of initial tags - meta to pass additional custom metadata -For example, to schedule a new job for a given spider with custom params:: +For example, to run a new job for a given spider with custom params:: - >>> job = spider.jobs.schedule(units=2, job_settings={'SETTING': 'VALUE'}, + >>> job = spider.jobs.run(units=2, job_settings={'SETTING': 'VALUE'}, priority=1, add_tag=['tagA','tagB'], meta={'custom-data': 'val1'}) -Note that if you schedule a job on project level, spider name is required:: +Note that if you run a job on project level, spider name is required:: - >>> job = project.jobs.schedule('spider1') + >>> job = project.jobs.run('spider1') count ^^^^^ From fc38119e9aaf49d1afc91f1d946bce903761b036 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Mon, 27 Mar 2017 12:01:48 +0300 Subject: [PATCH 14/40] Minor improvements for the docs --- docs/client/overview.rst | 38 +++++++++++++++++++++++++++++++------- docs/legacy/connection.rst | 11 ++++++++--- docs/legacy/hubstorage.rst | 11 ++++++----- docs/quickstart.rst | 10 ++++++---- 4 files changed, 51 insertions(+), 19 deletions(-) diff --git a/docs/client/overview.rst b/docs/client/overview.rst index 2fb5f967..4383b311 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -44,7 +44,7 @@ And select a particular project to work with:: >>> project.key '123' -The above is a shortcut for ``client.projects.get(123)``. +.. tip:: The above is a shortcut for ``client.projects.get(123)``. 
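+
+For instance, both forms below are expected to point to the same project::
+
+    >>> client.projects.get(123).key
+    '123'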
Project ------- @@ -60,13 +60,15 @@ For example, to schedule a spider run (it returns a job object):: Project instance also has the following fields: -- activity - access to project activity records -- collections - work with project collections (see ``Collections`` section) -- frontiers - using project frontier (see ``Frontiers`` section) -- settings - interface to project settings -- spiders - access to spiders collection (see ``Spiders`` section) +- **activity** - access to :ref:`project activity ` records +- **collections** - work with :ref:`project collections ` +- **frontiers** - using :ref:`project frontiers ` +- **settings** - interface to :ref:`project settings ` +- **spiders** - access to :ref:`spiders collection ` +.. _project-settings: + Settings -------- @@ -89,6 +91,7 @@ Or update a few project settings at once:: >>> project.settings.update({'default_job_units': 1, ... 'job_runtime_limit': 20}) +.. _project-spiders: Spiders ------- @@ -111,6 +114,8 @@ To select a particular spider to work with:: >>> spider.name spider2 +.. _spider: + Spider ------ @@ -123,6 +128,8 @@ To schedule a spider run:: Note that you don't need to specify spider name explicitly. +.. _jobs: + Jobs ---- @@ -299,6 +306,8 @@ To delete a job:: >>> job.delete() +.. _job-metadata: + Metadata ^^^^^^^^ @@ -324,6 +333,8 @@ Anything can be stored in metadata, here is example how to add tags:: >>> job.metadata.set('tags', ['obsolete']) +.. _job-items: + Items ^^^^^ @@ -332,6 +343,8 @@ To retrieve all scraped items from a job:: >>> for item in job.items.iter(): ... # do something with item (it's just a dict) +.. _job-logs: + Logs ^^^^ @@ -346,6 +359,8 @@ To retrieve all log entries from a job:: 'time': 1482233733976}, } +.. _job-requests: + Requests ^^^^^^^^ @@ -364,6 +379,8 @@ To retrieve all requests from a job:: 'url': 'https://example.com' }] +.. _job-samples: + Samples ^^^^^^^ @@ -374,6 +391,7 @@ To retrieve all samples for a job:: >>> sample [1482233732452, 0, 0, 0, 0, 0] +.. _project-activity: Activity -------- @@ -401,6 +419,8 @@ Or post multiple events at once:: >>> project.activity.add(events) +.. _project-collections: + Collections ----------- @@ -427,6 +447,8 @@ Usual workflow with `Collections`_ would be:: Collections are available on project level only. +.. _project-frontiers: + Frontiers --------- @@ -526,6 +548,8 @@ Close batch writers of all frontiers of a project:: Frontiers are available on project level only. +.. _job-tags: + Tags ---- @@ -552,7 +576,7 @@ Exceptions scrapinghub.exceptions.ScrapinghubAPIError ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Base exception class. +Base exception class for all other exceptions listed below. scrapinghub.exceptions.BadRequest diff --git a/docs/legacy/connection.rst b/docs/legacy/connection.rst index 891c8eef..11e53351 100644 --- a/docs/legacy/connection.rst +++ b/docs/legacy/connection.rst @@ -1,7 +1,9 @@ -scrapinghub.legacy.Connection -============================= +scrapinghub.Connection +====================== -TODO add short description & deprecation message here! +The module is the very first Python library for communicating with the Scrapinghub API. + +[WARNING] It is deprecated, please use `scrapinghub.ScrapinghubClient`_ instead. Overview -------- @@ -106,3 +108,6 @@ Module contents :members: :undoc-members: :show-inheritance: + + +.. 
_scrapinghub.ScrapinghubClient: ../client/overview.html diff --git a/docs/legacy/hubstorage.rst b/docs/legacy/hubstorage.rst index 6b5df061..5a024faa 100644 --- a/docs/legacy/hubstorage.rst +++ b/docs/legacy/hubstorage.rst @@ -1,10 +1,9 @@ -scrapinghub.hubstorage.HubstorageClient -======================================= +scrapinghub.HubstorageClient +============================ -TODO add short description & deprecation message here! +The library can be used for interaction with spiders, jobs and scraped data through ``storage.scrapinghub.com`` endpoints. - -The library can also be used for interaction with spiders, jobs and scraped data through ``storage.scrapinghub.com`` endpoints. +[WARNING] It is deprecated, please use `scrapinghub.ScrapinghubClient`_ instead. Overview @@ -246,3 +245,5 @@ Module contents :members: :undoc-members: :show-inheritance: + +.. _scrapinghub.ScrapinghubClient: ../client/overview.html diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 911cf7ee..f484bd35 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -14,7 +14,7 @@ The quick way:: pip install scrapinghub -You can also install the library with MessagePack support, it provides better +You can also install the library with `MessagePack`_ support, it provides better response time and improved bandwidth usage:: pip install scrapinghub[msgpack] @@ -50,13 +50,13 @@ Access your jobs data:: 'size': 35000, } -Many more feature are awaiting for you. +Many more features `are awaiting`_ for you. Tests ----- -The package is covered with integration tests based on `VCR.py library`_: there +The package is covered with integration tests based on `VCR.py`_ library: there are recorded cassettes files in ``tests/*/cassettes`` used instead of HTTP requests to real services, it helps to simplify and speed up development. @@ -80,4 +80,6 @@ If you want to update/recreate all the cassettes from scratch, please use:: Note that internally the above command erases the whole folder with cassettes. -.. _VCR.py library: https://pypi.python.org/pypi/vcrpy +.. _MessagePack: https://en.wikipedia.org/wiki/MessagePack +.. _are awaiting: client/overview.html +.. _VCR.py: https://pypi.python.org/pypi/vcrpy From 7c09b49fb80fb3aea078f9e8921fded0b4cbbe4a Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Mon, 27 Mar 2017 16:35:03 +0300 Subject: [PATCH 15/40] Minor style fixes --- docs/client/apidocs.rst | 4 ++-- docs/client/overview.rst | 6 +++--- scrapinghub/client/frontiers.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/client/apidocs.rst b/docs/client/apidocs.rst index 62accf98..a4d8fcb3 100644 --- a/docs/client/apidocs.rst +++ b/docs/client/apidocs.rst @@ -1,5 +1,5 @@ -Code-based documentation -======================== +API Reference +============= Module contents --------------- diff --git a/docs/client/overview.rst b/docs/client/overview.rst index 4383b311..271056c1 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -55,7 +55,7 @@ Jobs instance is described well in ``Jobs`` section below. 
For example, to schedule a spider run (it returns a job object):: - >>> project.jobs.run('spider1', job_args={'arg1':'val1'}) + >>> project.jobs.run('spider1', job_args={'arg1': 'val1'}) > Project instance also has the following fields: @@ -123,7 +123,7 @@ Like project instance, spider instance has ``jobs`` field to work with the spide To schedule a spider run:: - >>> spider.jobs.run(job_args={'arg1:'val1'}) + >>> spider.jobs.run(job_args={'arg1': 'val1'}) > Note that you don't need to specify spider name explicitly. @@ -157,7 +157,7 @@ Use ``run`` method to run a new job for project/spider:: Scheduling logic supports different options, like -- job_args to provide spider arguments for the job +- job_args to provide arguments for the job - units to specify amount of units to run the job - job_settings to pass additional settings for the job - priority to set higher/lower priority of the job diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index 62a81685..2ea7f325 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -105,7 +105,7 @@ def list(self): """List frontiers names. :return: a list of frontiers names. - :rtype: list of strings + :rtype: List[str] """ return next(self._origin.apiget('list')) From 98f4e85ae810b7e36e052f1220fa09217c60cff1 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Mon, 27 Mar 2017 18:37:25 +0300 Subject: [PATCH 16/40] Improve return types --- scrapinghub/client/__init__.py | 4 ++-- scrapinghub/client/collections.py | 20 ++++++++++---------- scrapinghub/client/frontiers.py | 26 +++++++++++++------------- scrapinghub/client/items.py | 2 +- scrapinghub/client/jobs.py | 24 ++++++++++++------------ scrapinghub/client/logs.py | 2 +- scrapinghub/client/projects.py | 8 ++++---- scrapinghub/client/spiders.py | 8 ++++---- scrapinghub/client/utils.py | 4 ++-- 9 files changed, 49 insertions(+), 49 deletions(-) diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py index b201b12b..8c47f30e 100644 --- a/scrapinghub/client/__init__.py +++ b/scrapinghub/client/__init__.py @@ -58,7 +58,7 @@ def get_project(self, project_id): :param project_id: integer or string numeric project id. :return: :class:`Project` object. - :rtype: scrapinghub.client.projects.Project + :rtype: :class:`scrapinghub.client.projects.Project` Usage:: @@ -74,7 +74,7 @@ def get_job(self, job_key): :param job_key: job key string in format 'project_id/spider_id/job_id', where all the components are integers. :return: :class:`Job` object. - :rtype: scrapinghub.client.jobs.Job + :rtype: :class:`scrapinghub.client.jobs.Job` Usage:: diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index e3f19139..f2a95c38 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -30,7 +30,7 @@ def get(self, type_, name): :param type_: a collection type string. :param name: a collection name string. :return: :class:`Collection` object. - :rtype: Collection + :rtype: :class:`Collection` """ self._origin._validate_collection(type_, name) return Collection(self._client, self, type_, name) @@ -40,7 +40,7 @@ def get_store(self, name): :param name: a collection name string. :return: :class:`Collection` object. - :rtype: Collection + :rtype: :class:`Collection` """ return self.get('s', name) @@ -51,7 +51,7 @@ def get_cached_store(self, name): :param name: a collection name string. :return: :class:`Collection` object. 
- :rtype: Collection + :rtype: :class:`Collection` """ return self.get('cs', name) @@ -62,7 +62,7 @@ def get_versioned_store(self, name): :param name: a collection name string. :return: :class:`Collection` object. - :rtype: Collection + :rtype: :class:`Collection` """ return self.get('vs', name) @@ -73,7 +73,7 @@ def get_versioned_cached_store(self, name): :param name: a collection name string. :return: :class:`Collection` object. - :rtype: Collection + :rtype: :class:`Collection` """ return self.get('vcs', name) @@ -82,7 +82,7 @@ def iter(self): :return: an iterator over collections list where each collection is represented by a dictionary with ('name','type') fields. - :rtype: collections.Iterable[dict] + :rtype: :class:`collections.Iterable[dict]` """ return self._origin.apiget('list') @@ -91,7 +91,7 @@ def list(self): :return: a list of collections where each collection is represented by a dictionary with ('name','type') fields. - :rtype: List[dict] + :rtype: :class:`list[dict]` """ return list(self.iter()) @@ -171,7 +171,7 @@ def list(self, key=None, prefix=None, prefixcount=None, startts=None, :param requests_params: (optional) a dict with optional requests params. :param \*\*params: (optional) additional query params for the request. :return: a list of items where each item is represented with a dict. - :rtype: List[dict] + :rtype: :class:`list[dict]` """ # FIXME there should be similar docstrings for iter/iter_raw_json # but as we proxy them as-is, it's not in place, should be improved @@ -186,7 +186,7 @@ def get(self, key, **params): :param key: string item key. :param \*\*params: (optional) additional query params for the request. :return: an item dictionary if exists. - :rtype: dict + :rtype: :class:`dict` """ if key is None: raise ValueError("key cannot be None") @@ -228,7 +228,7 @@ def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, :param requests_params: (optional) a dict with optional requests params. :param \*\*params: (optional) additional query params for the request. :return: an iterator over items list packed with msgpack. - :rtype: collections.Iterable[bytes] + :rtype: :class:`collections.Iterable[bytes]` """ update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index 2ea7f325..be94f931 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -22,7 +22,7 @@ def _get_writer(self, frontier, slot): callback to write newcount data per slot. :return: a batchuploader writer instance. - :rtype: scrapinghub.hubstorage.batchuploader._BatchWriter + :rtype: :class:`scrapinghub.hubstorage.batchuploader._BatchWriter` """ key = (frontier, slot) writer = self._writers.get(key) @@ -89,7 +89,7 @@ def get(self, name): :param name: a frontier name string. :return: :class:`Frontier` instance. - :rtype: Frontier + :rtype: :class:`Frontier` """ return Frontier(self._client, self, name) @@ -97,7 +97,7 @@ def iter(self): """Iterate through frontiers. :return: an iterator over frontiers names. - :rtype: collections.Iterable[str] + :rtype: :class:`collections.Iterable[str]` """ return iter(self.list()) @@ -105,7 +105,7 @@ def list(self): """List frontiers names. :return: a list of frontiers names. - :rtype: List[str] + :rtype: :class:`list[str]` """ return next(self._origin.apiget('list')) @@ -155,7 +155,7 @@ def get(self, slot): """Get a slot by name. :return: :class:`FrontierSlot` instance. 
- :rtype: FrontierSlot + :rtype: :class:`FrontierSlot` """ return FrontierSlot(self._client, self, slot) @@ -163,7 +163,7 @@ def iter(self): """Iterate through slots. :return: an iterator over frontier slots names. - :rtype: collections.Iterate[str] + :rtype: :class:`collections.Iterate[str]` """ return iter(self.list()) @@ -171,7 +171,7 @@ def list(self): """List all slots. :return: a list of frontier slots names. - :rtype: List[str] + :rtype: :class:`list[str]` """ return next(self._frontiers._origin.apiget((self.key, 'list'))) @@ -250,7 +250,7 @@ def f(self): """Shortcut to have quick access to slot fingerprints. :return: :class:`FrontierSlotFingerprints` instance. - :rtype: FrontierSlotFingerprints + :rtype: :class:`FrontierSlotFingerprints` """ return self.fingerprints @@ -259,7 +259,7 @@ def q(self): """Shortcut to have quick access to a slot queue. :return: :class:`FrontierSlotQueue` instance. - :rtype: FrontierSlotQueue + :rtype: :class:`FrontierSlotQueue` """ return self.queue @@ -303,7 +303,7 @@ def iter(self, **params): :param \*\*params: (optional) additional query params for the request. :return: an iterator over fingerprints. - :rtype: collections.Iterable[str] + :rtype: :class:`collections.Iterable[str]` """ origin = self._frontier._frontiers._origin path = (self._frontier.key, 's', self.key, 'f') @@ -315,7 +315,7 @@ def list(self, **params): :param \*\*params: (optional) additional query params for the request. :return: a list of fingerprints. - :rtype: List[str] + :rtype: :class:`list[str]` """ return list(self.iter(**params)) @@ -339,7 +339,7 @@ def iter(self, mincount=None, **params): :param \*\*params: (optional) additional query params for the request. :return: an iterator over request batches in the queue where each batch is represented with a dict with ('id', 'requests') field. - :rtype: collections.Iterable[dict] + :rtype: :class:`collections.Iterable[dict]` """ origin = self._frontier._frontiers._origin path = (self._frontier.key, 's', self.key, 'q') @@ -353,7 +353,7 @@ def list(self, mincount=None, **params): :param \*\*params: (optional) additional query params for the request. :return: a list of request batches in the queue where each batch is represented with a dict with ('id', 'requests') field. - :rtype: List[dict] + :rtype: :class:`list[dict]` """ return list(self.iter(mincount=mincount, **params)) diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index a127e2e0..e8f4caa2 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -50,7 +50,7 @@ def _modify_iter_params(self, params): """Modify iter filter to convert offset to start parameter. :return: a dict with updated set of params. - :rtype: dict + :rtype: :class:`dict` """ params = super(Items, self)._modify_iter_params(params) offset = params.pop('offset', None) diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index a788d1c2..1c1bf784 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -58,7 +58,7 @@ def count(self, spider=None, state=None, has_tag=None, lacks_tag=None, :param \*\*params: (optional) other filter params. :return: jobs count. - :rtype: int + :rtype: :class:`int` Usage:: @@ -97,7 +97,7 @@ def iter(self, count=None, start=None, spider=None, state=None, :return: a generator object over a list of dictionaries of jobs summary for a given filter params. 
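A hedged usage sketch of iterating such summaries (the spider name and state value
are placeholders, and ``project`` is assumed to exist)::

    >>> for summary in project.jobs.iter(spider='spider1', state='finished'):
    ...     print(summary['key'])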
- :rtype: types.GeneratorType[dict] + :rtype: :class:`types.GeneratorType[dict]` Usage: @@ -164,7 +164,7 @@ def list(self, count=None, start=None, spider=None, state=None, :param \*\*params: (optional) other filter params. :return: list of dictionaries of jobs summary for a given filter params - :rtype: List[dict] + :rtype: :class:`list[dict]` Please note that list() method can use a lot of memory and for a large amount of jobs it's recommended to iterate through it via iter() @@ -195,7 +195,7 @@ def run(self, spider=None, units=None, priority=None, meta=None, :param \*\*params: (optional) additional keyword args. :return: a job key string pointing to the new job. - :rtype: str + :rtype: :class:`str` Usage:: @@ -236,7 +236,7 @@ def get(self, job_key): the spider (if :attr:`Spider.jobs` was used). :return: :class:`Job` object. - :rtype: scrapinghub.client.jobs.Job + :rtype: :class:`Job` Usage:: @@ -260,7 +260,7 @@ def summary(self, state=None, spider=None, **params): :param \*\*params: (optional) additional keyword args. :return: a list of dictionaries of jobs summary for a given filter params grouped by job state. - :rtype: List[dict] + :rtype: :class:`list[dict]` Usage:: @@ -288,7 +288,7 @@ def iter_last(self, start=None, start_after=None, count=None, :param \*\*params: (optional) additional keyword args. :return: a generator object over a list of dictionaries of jobs summary for a given filter params. - :rtype: types.GeneratorType[dict] + :rtype: :class:`types.GeneratorType[dict]` Usage: @@ -341,7 +341,7 @@ def update_tags(self, add=None, remove=None, spider=None): have to specify ``spider`` param when using :attr:`Project.jobs`). :return: amount of jobs that were updated. - :rtype: int + :rtype: :class:`int` Usage: @@ -436,7 +436,7 @@ def start(self, **params): :param \*\*params: (optional) keyword meta parameters to update. :return: a previous string job state. - :rtype: str + :rtype: :class:`str` Usage:: @@ -450,7 +450,7 @@ def finish(self, **params): :param \*\*params: (optional) keyword meta parameters to update. :return: a previous string job state. - :rtype: str + :rtype: :class:`str` Usage:: @@ -464,7 +464,7 @@ def delete(self, **params): :param \*\*params: (optional) keyword meta parameters to update. :return: a previous string job state. - :rtype: str + :rtype: :class:`str` Usage:: @@ -479,7 +479,7 @@ def update(self, state, **params): :param state: a new job state. :param \*\*params: (optional) keyword meta parameters to update. :return: a previous string job state. - :rtype: str + :rtype: :class:`str` Usage:: diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index ebfdfde7..32a2b19b 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -60,7 +60,7 @@ def _modify_iter_params(self, params): :param params: an original dictionary with params. :return: a modified dictionary with params. - :rtype: dict + :rtype: :class:`dict` """ params = super(Logs, self)._modify_iter_params(params) offset = params.pop('offset', None) diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py index eff3c6aa..e320acb1 100644 --- a/scrapinghub/client/projects.py +++ b/scrapinghub/client/projects.py @@ -32,7 +32,7 @@ def get(self, project_id): :param project_id: integer or string numeric project id. :return: :class:`Project` object. - :rtype: scrapinghub.client.projects.Project + :rtype: :class:`scrapinghub.client.projects.Project` Usage:: @@ -46,7 +46,7 @@ def list(self): """Get list of projects available to current user. 
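For example, listing available project ids might look like this (the ids shown are
placeholders, and ``client`` is assumed to be an already created client instance)::

    >>> client.projects.list()
    [123, 456]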
:return: a list of project ids. - :rtype: List[int] + :rtype: :class:`list[int]` Usage:: @@ -61,7 +61,7 @@ def iter(self): Provided for the sake of API consistency. :return: an iterator over project ids list. - :rtype: collections.Iterable[int] + :rtype: :class:`collections.Iterable[int]` """ return iter(self.list()) @@ -72,7 +72,7 @@ def summary(self, state=None, **params): :return: a list of dictionaries: each dictionary represents a project summary (amount of pending/running/finished jobs and a flag if it has a capacity to run new jobs). - :rtype: List[dict] + :rtype: :class:`list[dict]` Usage:: diff --git a/scrapinghub/client/spiders.py b/scrapinghub/client/spiders.py index 8c7c6d79..b70aaca4 100644 --- a/scrapinghub/client/spiders.py +++ b/scrapinghub/client/spiders.py @@ -33,7 +33,7 @@ def get(self, spider, **params): :param spider: a string spider name. :return: :class:`Spider` object. - :rtype: scrapinghub.client.spiders.Spider + :rtype: :class:`scrapinghub.client.spiders.Spider` Usage:: @@ -52,7 +52,7 @@ def list(self): """Get a list of spiders for a project. :return: a list of dictionaries with spiders metadata. - :rtype: List[dict] + :rtype: :class:`list[dict]` Usage:: @@ -68,7 +68,7 @@ def iter(self): :return: an iterator over spiders list where each spider is represented as a dict containing its metadata. - :rtype: collection.Iterable[dict] + :rtype: :class:`collection.Iterable[dict]` Provided for the sake of API consistency. """ @@ -122,7 +122,7 @@ def list_tags(self): """List spider tags. :return: a list of spider tags. - :rtype: List[str] + :rtype: :class:`list[str]` """ path = 'v2/projects/{}/spiders/{}'.format(self.project_id, self._id) url = urljoin(self._client._connection.url, path) diff --git a/scrapinghub/client/utils.py b/scrapinghub/client/utils.py index c0ec0496..a353bc0c 100644 --- a/scrapinghub/client/utils.py +++ b/scrapinghub/client/utils.py @@ -125,7 +125,7 @@ def _modify_iter_params(self, params): :param params: a dictionary with input parameters. :return: an updated dictionary with parameters. - :rtype: dict + :rtype: :class:`dict` """ return format_iter_filters(params) @@ -185,7 +185,7 @@ def iter(self): """Iterate through key/value pairs. :return: an iterator over key/value pairs. - :rtype: collections.Iterable + :rtype: :class:`collections.Iterable` """ return six.iteritems(next(self._origin.apiget())) From 2ff6b2948bdea6fa58777f09e2a656a9edf09277 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Mon, 27 Mar 2017 19:52:17 +0300 Subject: [PATCH 17/40] Document newcount for frontiers --- scrapinghub/client/frontiers.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index be94f931..8c2b10fd 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -111,6 +111,11 @@ def list(self): @property def newcount(self): + """Amount of new entries added to all frontiers. + + :return: amount of new entries + :rtype: :class:`int` + """ return sum(self._origin.newcount.values()) @@ -184,6 +189,11 @@ def flush(self): @property def newcount(self): + """Amount of new entries added to frontier. + + :return: amount of new entries + :rtype: :class:`int` + """ newcount_values = self._frontiers._origin.newcount return sum(v for (frontier, _), v in newcount_values.items() if frontier == self.key) @@ -278,6 +288,11 @@ def flush(self): @property def newcount(self): + """Amount of new entries added to slot. 
+ + :return: amount of new entries + :rtype: :class:`int` + """ newcount_values = self._frontier._frontiers._origin.newcount return newcount_values.get((self._frontier.key, self.key), 0) From d9b6d29d85a1d655c21476160b1018c4fa4883d1 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Mon, 27 Mar 2017 20:05:37 +0300 Subject: [PATCH 18/40] Provide missing docstrings --- scrapinghub/client/exceptions.py | 2 ++ scrapinghub/client/frontiers.py | 5 +++++ scrapinghub/client/projects.py | 5 +++++ scrapinghub/client/utils.py | 13 +++++++++++++ 4 files changed, 25 insertions(+) diff --git a/scrapinghub/client/exceptions.py b/scrapinghub/client/exceptions.py index 4c014d35..8953cd3e 100644 --- a/scrapinghub/client/exceptions.py +++ b/scrapinghub/client/exceptions.py @@ -57,6 +57,7 @@ class ServerError(ScrapinghubAPIError): def wrap_http_errors(method): + """Internal helper to handle exceptions gracefully.""" @wraps(method) def wrapped(*args, **kwargs): try: @@ -93,6 +94,7 @@ def wrapped(*args, **kwargs): def wrap_value_too_large(method): + """Internal wrapper for ValueTooLarge exception.""" @wraps(method) def wrapped(*args, **kwargs): try: diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index 8c2b10fd..4d2d57a2 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -41,6 +41,7 @@ def _get_writer(self, frontier, slot): return writer def _writer_callback(self, key, response): + """Writer callback function when new batch is added.""" self.newcount[key] += response.json()["newcount"] @@ -305,6 +306,10 @@ def __init__(self, slot): self._slot = slot def add(self, fps): + """Add new fingerprints to slot. + + :param fps: a list of string fingerprints to add. + """ origin = self._frontier._frontiers._origin writer = origin._get_writer(self._frontier.key, self.key) fps = list(fps) if not isinstance(fps, list) else fps diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py index e320acb1..918d22d4 100644 --- a/scrapinghub/client/projects.py +++ b/scrapinghub/client/projects.py @@ -174,5 +174,10 @@ class Settings(_MappingProxy): >>> project.settings.delete('job_runtime_limit') """ def set(self, key, value): + """Update project setting value by key. + + :param key: a string setting key. + :param value: new setting value. + """ # FIXME drop the method when post-by-key is implemented on server side self.update({key: value}) diff --git a/scrapinghub/client/utils.py b/scrapinghub/client/utils.py index a353bc0c..3cac5f44 100644 --- a/scrapinghub/client/utils.py +++ b/scrapinghub/client/utils.py @@ -36,6 +36,12 @@ def __str__(self): def parse_project_id(project_id): + """Simple check for project id. + + :param project_id: a numeric project id, int or string. + :return: a unified project id. + :rtype: :class:`str` + """ try: int(project_id) except ValueError: @@ -44,6 +50,12 @@ def parse_project_id(project_id): def parse_job_key(job_key): + """Inner helper to parse job key. + + :param job_key: a job key (str or tuple of 3 ints). + :return: parsed job key. 
+ :rtype: :class:`JobKey` + """ if isinstance(job_key, tuple): parts = job_key elif isinstance(job_key, six.string_types): @@ -239,6 +251,7 @@ def format_iter_filters(params): def update_kwargs(kwargs, **params): + """Update kwargs dict with non-empty params with json-encoded values.""" kwargs.update({k: json.dumps(v) if isinstance(v, dict) else v for k, v in params.items() if v is not None}) From b69cc9c4929f7ce1e776a567af41f81a0a7da748 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 10:59:10 +0300 Subject: [PATCH 19/40] Dot in the end of return section --- scrapinghub/client/frontiers.py | 6 +++--- scrapinghub/client/jobs.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index 4d2d57a2..fa49256c 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -114,7 +114,7 @@ def list(self): def newcount(self): """Amount of new entries added to all frontiers. - :return: amount of new entries + :return: amount of new entries. :rtype: :class:`int` """ return sum(self._origin.newcount.values()) @@ -192,7 +192,7 @@ def flush(self): def newcount(self): """Amount of new entries added to frontier. - :return: amount of new entries + :return: amount of new entries. :rtype: :class:`int` """ newcount_values = self._frontiers._origin.newcount @@ -291,7 +291,7 @@ def flush(self): def newcount(self): """Amount of new entries added to slot. - :return: amount of new entries + :return: amount of new entries. :rtype: :class:`int` """ newcount_values = self._frontier._frontiers._origin.newcount diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index 1c1bf784..43fa306c 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -163,7 +163,7 @@ def list(self, count=None, start=None, spider=None, state=None, field name or a list of field names to return. :param \*\*params: (optional) other filter params. - :return: list of dictionaries of jobs summary for a given filter params + :return: list of dictionaries of jobs summary for a given filter params. :rtype: :class:`list[dict]` Please note that list() method can use a lot of memory and for a large From 90911eb0d2dabf8f45c089fc0ea31290547cc1fd Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 11:10:27 +0300 Subject: [PATCH 20/40] Use autoexception in docs --- docs/client/overview.rst | 47 +++++--------------------------- scrapinghub/__init__.py | 3 +- scrapinghub/client/exceptions.py | 13 +++++---- 3 files changed, 16 insertions(+), 47 deletions(-) diff --git a/docs/client/overview.rst b/docs/client/overview.rst index 271056c1..a449f838 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -573,46 +573,13 @@ Modifying tags is available on spider/job levels. Exceptions ---------- -scrapinghub.exceptions.ScrapinghubAPIError -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Base exception class for all other exceptions listed below. - - -scrapinghub.exceptions.BadRequest -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Usually raised in case of 400 response from API. - - -scrapinghub.exceptions.Unauthorized -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Not enough access to some resources. - - -scrapinghub.exceptions.NotFound -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Entity doesn't exist (e.g. spider or project). - - -scrapinghub.exceptions.ValueTooLarge -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Value cannot be writtent because it exceeds size limits. 
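A hedged sketch of handling one of these exceptions when scheduling a job (the API
key, project id, spider name and argument values are placeholders)::

    >>> from scrapinghub import ScrapinghubClient, DuplicateJobError
    >>> client = ScrapinghubClient('APIKEY')
    >>> project = client.get_project(123)
    >>> try:
    ...     project.jobs.run('spider1', job_args={'arg1': 'val1'})
    ... except DuplicateJobError:
    ...     pass  # the same spider with the same args is already scheduled or running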
- - -scrapinghub.exceptions.DuplicateJobError -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Job for given spider with given arguments is already scheduled or running. - - -scrapinghub.exceptions.ServerError -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Indicates some server error: something unexpected has happened. +.. autoexception:: scrapinghub.ScrapinghubAPIError +.. autoexception:: scrapinghub.BadRequest +.. autoexception:: scrapinghub.Unauthorized +.. autoexception:: scrapinghub.NotFound +.. autoexception:: scrapinghub.ValueTooLarge +.. autoexception:: scrapinghub.DuplicateJobError +.. autoexception:: scrapinghub.ServerError .. _Scrapinghub API: http://doc.scrapinghub.com/api.html diff --git a/scrapinghub/__init__.py b/scrapinghub/__init__.py index b1b67292..c1240ce7 100644 --- a/scrapinghub/__init__.py +++ b/scrapinghub/__init__.py @@ -1,7 +1,7 @@ __all__ = ["APIError", "Connection", "HubstorageClient", "ScrapinghubClient", "ScrapinghubAPIError", "DuplicateJobError", "BadRequest", "NotFound", - "Unauthorized", "ValueTooLarge"] + "Unauthorized", "ValueTooLarge", "ServerError"] import pkgutil __version__ = pkgutil.get_data(__package__, 'VERSION') @@ -19,4 +19,5 @@ NotFound, Unauthorized, ValueTooLarge, + ServerError, ) diff --git a/scrapinghub/client/exceptions.py b/scrapinghub/client/exceptions.py index 8953cd3e..d79b2eac 100644 --- a/scrapinghub/client/exceptions.py +++ b/scrapinghub/client/exceptions.py @@ -24,6 +24,7 @@ def _get_http_error_msg(exc): class ScrapinghubAPIError(Exception): + """Base exception class.""" def __init__(self, message=None, http_error=None): self.http_error = http_error @@ -33,27 +34,27 @@ def __init__(self, message=None, http_error=None): class BadRequest(ScrapinghubAPIError): - pass + """Usually raised in case of 400 response from API.""" class Unauthorized(ScrapinghubAPIError): - pass + """Request lacks valid authentication credentials for the target resource.""" class NotFound(ScrapinghubAPIError): - pass + """Entity doesn't exist (e.g. spider or project).""" class ValueTooLarge(ScrapinghubAPIError): - pass + """Value cannot be writtent because it exceeds size limits.""" class DuplicateJobError(ScrapinghubAPIError): - pass + """Job for given spider with given arguments is already scheduled or running.""" class ServerError(ScrapinghubAPIError): - pass + """Indicates some server error: something unexpected has happened.""" def wrap_http_errors(method): From 8c980efacc7b36c48d51ebf04f0ba13cf415d26a Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 11:48:43 +0300 Subject: [PATCH 21/40] Minor docstrings fixes, plain apidocs struct --- docs/client/apidocs.rst | 57 ++++++++++++++++------------------ docs/client/overview.rst | 2 +- scrapinghub/client/__init__.py | 10 +++--- scrapinghub/client/projects.py | 4 +-- 4 files changed, 36 insertions(+), 37 deletions(-) diff --git a/docs/client/apidocs.rst b/docs/client/apidocs.rst index a4d8fcb3..5e6dc8b0 100644 --- a/docs/client/apidocs.rst +++ b/docs/client/apidocs.rst @@ -1,110 +1,107 @@ API Reference ============= -Module contents ---------------- +Client object +------------- .. automodule:: scrapinghub.client :members: :undoc-members: - :show-inheritance: + :inherited-members: -Submodules ----------- - -activity +Activity -------- .. automodule:: scrapinghub.client.activity :members: :undoc-members: - :show-inheritance: + :inherited-members: -collections +Collections ----------- .. 
automodule:: scrapinghub.client.collections :members: :undoc-members: - :show-inheritance: + :inherited-members: -exceptions +Exceptions ---------- .. automodule:: scrapinghub.client.exceptions :members: :undoc-members: - :show-inheritance: + :inherited-members: -frontiers +Frontiers --------- .. automodule:: scrapinghub.client.frontiers :members: :undoc-members: - :show-inheritance: + :inherited-members: -items +Items ----- .. automodule:: scrapinghub.client.items :members: :undoc-members: - :show-inheritance: + :inherited-members: -jobs +Jobs ---- .. automodule:: scrapinghub.client.jobs :members: :undoc-members: - :show-inheritance: + :inherited-members: -logs +Logs ---- .. automodule:: scrapinghub.client.logs :members: :undoc-members: - :show-inheritance: + :inherited-members: -projects +Projects -------- .. automodule:: scrapinghub.client.projects :members: :undoc-members: - :show-inheritance: + :inherited-members: -requests +Requests -------- .. automodule:: scrapinghub.client.requests :members: :undoc-members: - :show-inheritance: + :inherited-members: -samples +Samples ------- .. automodule:: scrapinghub.client.samples :members: :undoc-members: - :show-inheritance: + :inherited-members: -spiders +Spiders ------- .. automodule:: scrapinghub.client.spiders :members: :undoc-members: - :show-inheritance: + :inherited-members: -utils +Utils ----- .. automodule:: scrapinghub.client.utils :members: :undoc-members: - :show-inheritance: + :inherited-members: diff --git a/docs/client/overview.rst b/docs/client/overview.rst index a449f838..c967b046 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -51,7 +51,7 @@ Project Project instance has ``jobs`` field to work with the project jobs. -Jobs instance is described well in ``Jobs`` section below. +Jobs instance is described well in :ref:`Jobs ` section below. For example, to schedule a spider run (it returns a job object):: diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py index 8c47f30e..ea2e107b 100644 --- a/scrapinghub/client/__init__.py +++ b/scrapinghub/client/__init__.py @@ -33,7 +33,8 @@ class ScrapinghubClient(object): :param \*\*kwargs: (optional) Additional arguments for :class:`scrapinghub.hubstorage.HubstorageClient` constructor. - :ivar projects: projects collection, :class:`Projects` instance. + :ivar projects: projects collection, + :class:`scrapinghub.client.projects.Projects` instance. Usage:: @@ -52,12 +53,13 @@ def __init__(self, auth=None, dash_endpoint=None, **kwargs): self._hsclient = HubstorageClient(auth=(login, password), **kwargs) def get_project(self, project_id): - """Get :class:`Project` instance with a given project id. + """Get :class:`scrapinghub.client.projects.Project` instance with + a given project id. The method is a shortcut for client.projects.get(). :param project_id: integer or string numeric project id. - :return: :class:`Project` object. + :return: a project instance. :rtype: :class:`scrapinghub.client.projects.Project` Usage:: @@ -73,7 +75,7 @@ def get_job(self, job_key): :param job_key: job key string in format 'project_id/spider_id/job_id', where all the components are integers. - :return: :class:`Job` object. + :return: a job instance. 
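As a quick sketch (the job key shown is a placeholder in the documented
'project_id/spider_id/job_id' format, and ``client`` is assumed to exist)::

    >>> job = client.get_job('123/1/2')
    >>> job.key
    '123/1/2'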
:rtype: :class:`scrapinghub.client.jobs.Job` Usage:: diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py index 918d22d4..c25e3269 100644 --- a/scrapinghub/client/projects.py +++ b/scrapinghub/client/projects.py @@ -15,7 +15,7 @@ class Projects(object): """Collection of projects available to current user. - Not a public constructor: use :class:`Scrapinghub` client instance to get + Not a public constructor: use :class:`ScrapinghubClient` client instance to get a :class:`Projects` instance. See :attr:`Scrapinghub.projects` attribute. Usage:: @@ -98,7 +98,7 @@ class Project(object): Not a public constructor: use :class:`ScrapinghubClient` instance or :class:`Projects` instance to get a :class:`Project` instance. See - :meth:`Scrapinghub.get_project` or :meth:`Projects.get` methods. + :meth:`ScrapinghubClient.get_project` or :meth:`Projects.get` methods. :ivar key: string project id. :ivar activity: :class:`Activity` resource object. From 7ea58a6c26d39460e5c92cd84ddcf22f7ba896ee Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 12:47:18 +0300 Subject: [PATCH 22/40] Sphinx markup fixes --- docs/client/apidocs.rst | 10 --------- scrapinghub/client/__init__.py | 8 +++---- scrapinghub/client/activity.py | 9 ++++---- scrapinghub/client/collections.py | 26 +++++++++++----------- scrapinghub/client/frontiers.py | 15 +++++++------ scrapinghub/client/items.py | 9 ++++---- scrapinghub/client/jobs.py | 36 ++++++++++++++++--------------- scrapinghub/client/logs.py | 9 ++++---- scrapinghub/client/projects.py | 28 +++++++++++++----------- scrapinghub/client/requests.py | 9 ++++---- scrapinghub/client/samples.py | 9 ++++---- scrapinghub/client/spiders.py | 9 ++++---- scrapinghub/client/utils.py | 18 +++++++++------- 13 files changed, 100 insertions(+), 95 deletions(-) diff --git a/docs/client/apidocs.rst b/docs/client/apidocs.rst index 5e6dc8b0..e83f5c0f 100644 --- a/docs/client/apidocs.rst +++ b/docs/client/apidocs.rst @@ -9,7 +9,6 @@ Client object :undoc-members: :inherited-members: - Activity -------- @@ -32,7 +31,6 @@ Exceptions .. automodule:: scrapinghub.client.exceptions :members: :undoc-members: - :inherited-members: Frontiers --------- @@ -97,11 +95,3 @@ Spiders :members: :undoc-members: :inherited-members: - -Utils ------ - -.. automodule:: scrapinghub.client.utils - :members: - :undoc-members: - :inherited-members: diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py index ea2e107b..a751075c 100644 --- a/scrapinghub/client/__init__.py +++ b/scrapinghub/client/__init__.py @@ -31,10 +31,10 @@ class ScrapinghubClient(object): :param auth: Scrapinghub APIKEY or other SH auth credentials. :param dash_endpoint: (optional) Scrapinghub Dash panel url. :param \*\*kwargs: (optional) Additional arguments for - :class:`scrapinghub.hubstorage.HubstorageClient` constructor. + :class:`~scrapinghub.hubstorage.HubstorageClient` constructor. :ivar projects: projects collection, - :class:`scrapinghub.client.projects.Projects` instance. + :class:`~scrapinghub.client.projects.Projects` instance. Usage:: @@ -60,7 +60,7 @@ def get_project(self, project_id): :param project_id: integer or string numeric project id. :return: a project instance. - :rtype: :class:`scrapinghub.client.projects.Project` + :rtype: :class:`~scrapinghub.client.projects.Project` Usage:: @@ -76,7 +76,7 @@ def get_job(self, job_key): :param job_key: job key string in format 'project_id/spider_id/job_id', where all the components are integers. :return: a job instance. 
- :rtype: :class:`scrapinghub.client.jobs.Job` + :rtype: :class:`~scrapinghub.client.jobs.Job` Usage:: diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 40f02fc1..6367f389 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -7,11 +7,12 @@ class Activity(_Proxy): """Representation of collection of job activity events. - Not a public constructor: use :class:`Project` instance to get a - :class:`Activity` instance. See :attr:`Project.activity` attribute. + Not a public constructor: use :class:`~scrapinghub.client.projects.Project` + instance to get a :class:`~scrapinghub.client.activity.Activity` instance. + See :attr:`~scrapinghub.client.projects.Project.activity` attribute. - Please note that list() method can use a lot of memory and for a large - amount of activities it's recommended to iterate through it via iter() + Please note that ``list()`` method can use a lot of memory and for a large + amount of activities it's recommended to iterate through it via ``iter()`` method (all params and available filters are same for both methods). Usage: diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index f2a95c38..a7a0fb87 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -13,8 +13,9 @@ class Collections(_Proxy): """Access to project collections. - Not a public constructor: use :class:`Project` instance to get a - :class:`Collections` instance. See :attr:`Project.collections` attribute. + Not a public constructor: use :class:`~scrapinghub.client.projects.Project` + instance to get a :class:`Collections` instance. + See :attr:`~scrapinghub.client.projects.Project.collections` attribute. Usage:: @@ -29,7 +30,7 @@ def get(self, type_, name): :param type_: a collection type string. :param name: a collection name string. - :return: :class:`Collection` object. + :return: a collection object. :rtype: :class:`Collection` """ self._origin._validate_collection(type_, name) @@ -39,7 +40,7 @@ def get_store(self, name): """Method to get a store collection by name. :param name: a collection name string. - :return: :class:`Collection` object. + :return: a collection object. :rtype: :class:`Collection` """ return self.get('s', name) @@ -50,7 +51,7 @@ def get_cached_store(self, name): The collection type means that items expire after a month. :param name: a collection name string. - :return: :class:`Collection` object. + :return: a collection object. :rtype: :class:`Collection` """ return self.get('cs', name) @@ -61,7 +62,7 @@ def get_versioned_store(self, name): The collection type retains up to 3 copies of each item. :param name: a collection name string. - :return: :class:`Collection` object. + :return: a collection object. :rtype: :class:`Collection` """ return self.get('vs', name) @@ -72,7 +73,7 @@ def get_versioned_cached_store(self, name): Multiple copies are retained, and each one expires after a month. :param name: a collection name string. - :return: :class:`Collection` object. + :return: a collection object. :rtype: :class:`Collection` """ return self.get('vcs', name) @@ -159,9 +160,10 @@ def list(self, key=None, prefix=None, prefixcount=None, startts=None, endts=None, requests_params=None, **params): """Convenient shortcut to list iter results. - Please note that list() method can use a lot of memory and for a large - amount of elements it's recommended to iterate through it via iter() - method (all params and available filters are same for both methods). 
+ Please note that ``list()`` method can use a lot of memory and for a + large amount of elements it's recommended to iterate through it via + ``iter()`` method (all params and available filters are same for both + methods). :param key: a string key or a list of keys to filter with. :param prefix: a string prefix to filter items. @@ -197,7 +199,7 @@ def set(self, value): :param value: a dict representing a collection item. - The method returns None (original method returns an empty generator). + The method returns ``None`` (original method returns an empty generator). """ self._origin.set(value) @@ -206,7 +208,7 @@ def delete(self, keys): :param keys: a single key or a list of keys. - The method returns None (original method returns an empty generator). + The method returns ``None`` (original method returns an empty generator). """ if (not isinstance(keys, string_types) and not isinstance(keys, collections.Iterable)): diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index fa49256c..052774ea 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -22,7 +22,7 @@ def _get_writer(self, frontier, slot): callback to write newcount data per slot. :return: a batchuploader writer instance. - :rtype: :class:`scrapinghub.hubstorage.batchuploader._BatchWriter` + :rtype: :class:`~scrapinghub.hubstorage.batchuploader._BatchWriter` """ key = (frontier, slot) writer = self._writers.get(key) @@ -48,8 +48,9 @@ def _writer_callback(self, key, response): class Frontiers(_Proxy): """Frontiers collection for a project. - Not a public constructor: use :class:`Project` instance to get a - :class:`Frontiers` instance. See :attr:`Project.frontiers` attribute. + Not a public constructor: use :class:`~scrapinghub.client.projects.Project` + instance to get a :class:`Frontiers` instance. + See :attr:`~scrapinghub.client.Project.frontiers` attribute. Usage: @@ -89,7 +90,7 @@ def get(self, name): """Get a frontier by name. :param name: a frontier name string. - :return: :class:`Frontier` instance. + :return: a frontier instance. :rtype: :class:`Frontier` """ return Frontier(self._client, self, name) @@ -160,7 +161,7 @@ def __init__(self, client, frontiers, name): def get(self, slot): """Get a slot by name. - :return: :class:`FrontierSlot` instance. + :return: a frontier slot instance. :rtype: :class:`FrontierSlot` """ return FrontierSlot(self._client, self, slot) @@ -260,7 +261,7 @@ def __init__(self, client, frontier, slot): def f(self): """Shortcut to have quick access to slot fingerprints. - :return: :class:`FrontierSlotFingerprints` instance. + :return: fingerprints collection for the slot. :rtype: :class:`FrontierSlotFingerprints` """ return self.fingerprints @@ -269,7 +270,7 @@ def f(self): def q(self): """Shortcut to have quick access to a slot queue. - :return: :class:`FrontierSlotQueue` instance. + :return: queue instance for the slot. :rtype: :class:`FrontierSlotQueue` """ return self.queue diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index e8f4caa2..49c3e49c 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -6,11 +6,12 @@ class Items(_Proxy): """Representation of collection of job items. - Not a public constructor: use :class:`Job` instance to get a :class:`Items` - instance. See :attr:`Job.items` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instanc + e to get a :class:`Items` instance. + See :attr:`~scrapinghub.client.jobs.Job.items` attribute. 
- Please note that list() method can use a lot of memory and for a large - amount of items it's recommended to iterate through it via iter() method + Please note that ``list()`` method can use a lot of memory and for a large + amount of items it's recommended to iterate through it via ``iter()`` method (all params and available filters are same for both methods). Usage: diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index 43fa306c..fa13fa17 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -19,9 +19,10 @@ class Jobs(object): """Class representing a collection of jobs for a project/spider. - Not a public constructor: use :class:`Project` instance or :class:`Spider` - instance to get a :class:`Jobs` instance. See :attr:`Project.jobs` and - :attr:`Spider.jobs` attributes. + Not a public constructor: use :class:`~scrapinghub.client.projects.Project` + instance or :class:`~scrapinghub.client.spiders.Spider` instance to get + a :class:`Jobs` instance. See :attr:`scrapinghub.client.projects.Project.jobs` + and :attr:`scrapinghub.client.spiders.Spider.jobs` attributes. :ivar project_id: a string project id. :ivar spider: :class:`Spider` object if defined. @@ -235,7 +236,7 @@ def get(self, job_key): :class:`Jobs` instance, and job_key's spider component should match the spider (if :attr:`Spider.jobs` was used). - :return: :class:`Job` object. + :return: a job object. :rtype: :class:`Job` Usage:: @@ -255,8 +256,8 @@ def summary(self, state=None, spider=None, **params): """Get jobs summary (optionally by state). :param state: (optional) a string state to filter jobs. - :param spider: (optional) a spider name - (not needed if instantiated with :class:`Spider`). + :param spider: (optional) a spider name (not needed if instantiated + with :class:`~scrapinghub.client.spiders.Spider`). :param \*\*params: (optional) additional keyword args. :return: a list of dictionaries of jobs summary for a given filter params grouped by job state. @@ -283,8 +284,8 @@ def iter_last(self, start=None, start_after=None, count=None, :param start: (optional) :param start_after: (optional) :param count: (optional) - :param spider: (optional) a spider name - (not needed if instantiated with :class:`Spider`). + :param spider: (optional) a spider name (not needed if instantiated + with :class:`~scrapinghub.client.spiders.Spider`). :param \*\*params: (optional) additional keyword args. :return: a generator object over a list of dictionaries of jobs summary for a given filter params. @@ -371,17 +372,18 @@ def update_tags(self, add=None, remove=None, spider=None): class Job(object): """Class representing a job object. - Not a public constructor: use :class:`ScrapinghubClient` instance or - :class:`Jobs` instance to get a :class:`Job` instance. See - :meth:`ScrapinghubClient.get_job` and :meth:`Jobs.get` methods. + Not a public constructor: use :class:`~scrapinghub.client.ScrapinghubClient` + instance or :class:`Jobs` instance to get a :class:`Job` instance. See + :meth:`scrapinghub.client.ScrapinghubClient.get_job` and :meth:`Jobs.get` + methods. :ivar project_id: integer project id. :ivar key: a job key. - :ivar items: :class:`Items` resource object. - :ivar logs: :class:`Logs` resource object. - :ivar requests: :class:`Requests` resource object. - :ivar samples: :class:`Samples` resource object. - :ivar metadata: :class:`Metadata` resource. + :ivar items: :class:`~scrapinghub.client.items.Items` resource object. + :ivar logs: :class:`~scrapinghub.client.logs.Logs` resource object. 
+ :ivar requests: :class:`~scrapinghub.client.requests.Requests` resource object. + :ivar samples: :class:`~scrapinghub.client.samples.Samples` resource object. + :ivar metadata: :class:`JobMeta` resource object. Usage:: @@ -508,7 +510,7 @@ class JobMeta(_MappingProxy): """Class representing job metadata. Not a public constructor: use :class:`Job` instance to get a - :class:`Jobmeta` instance. See :attr:`Job.metadata` attribute. + :class:`JobMeta` instance. See :attr:`Job.metadata` attribute. Usage: diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 32a2b19b..15a70038 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -8,11 +8,12 @@ class Logs(_Proxy): """Representation of collection of job logs. - Not a public constructor: use :class:`Job` instance to get a :class:`Logs` - instance. See :attr:`Job.logs` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance + to get a :class:`Logs` instance. See :attr:`~scrapinghub.client.jobs.Job.logs` + attribute. - Please note that list() method can use a lot of memory and for a large - amount of logs it's recommended to iterate through it via iter() method + Please note that ``list()`` method can use a lot of memory and for a large + amount of logs it's recommended to iterate through it via ``iter()`` method (all params and available filters are same for both methods). Usage: diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py index c25e3269..cbb3e33f 100644 --- a/scrapinghub/client/projects.py +++ b/scrapinghub/client/projects.py @@ -15,8 +15,9 @@ class Projects(object): """Collection of projects available to current user. - Not a public constructor: use :class:`ScrapinghubClient` client instance to get - a :class:`Projects` instance. See :attr:`Scrapinghub.projects` attribute. + Not a public constructor: use :class:`~scrapinghub.client.ScrapinghubClient` + client instance to get a :class:`Projects` instance. + See :attr:`scrapinghub.client.Scrapinghub.projects` attribute. Usage:: @@ -31,8 +32,8 @@ def get(self, project_id): """Get project for a given project id. :param project_id: integer or string numeric project id. - :return: :class:`Project` object. - :rtype: :class:`scrapinghub.client.projects.Project` + :return: a project object. + :rtype: :class:`Project` Usage:: @@ -96,17 +97,18 @@ def summary(self, state=None, **params): class Project(object): """Class representing a project object and its resources. - Not a public constructor: use :class:`ScrapinghubClient` instance or - :class:`Projects` instance to get a :class:`Project` instance. See - :meth:`ScrapinghubClient.get_project` or :meth:`Projects.get` methods. + Not a public constructor: use :class:`~scrapinghub.client.ScrapinghubClient` + instance or :class:`Projects` instance to get a :class:`Project` instance. + See :meth:`scrapinghub.client.ScrapinghubClient.get_project` or + :meth:`Projects.get` methods. :ivar key: string project id. - :ivar activity: :class:`Activity` resource object. - :ivar collections: :class:`Collections` resource object. - :ivar frontiers: :class:`Frontiers` resource object. - :ivar jobs: :class:`Jobs` resource object. - :ivar settings: :class:`Settings` resource object. - :ivar spiders: :class:`Spiders` resource object. + :ivar activity: :class:`~scrapinghub.client.activity.Activity` resource object. + :ivar collections: :class:`~scrapinghub.client.collections.Collections` resource object. 
+ :ivar frontiers: :class:`~scrapinghub.client.frontiers.Frontiers` resource object. + :ivar jobs: :class:`~scrapinghub.client.jobs.Jobs` resource object. + :ivar settings: :class:`~scrapinghub.client.settings.Settings` resource object. + :ivar spiders: :class:`~scrapinghub.client.spiders.Spiders` resource object. Usage:: diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index 06ee1125..82777fdb 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -6,11 +6,12 @@ class Requests(_Proxy): """Representation of collection of job requests. - Not a public constructor: use :class:`Job` instance to get a - :class:`Requests` instance. See :attr:`Job.requests` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance + to get a :class:`Requests` instance. + See :attr:`~scrapinghub.client.jobs.Job.requests` attribute. - Please note that list() method can use a lot of memory and for a large - amount of requests it's recommended to iterate through it via iter() + Please note that ``list()`` method can use a lot of memory and for a large + amount of requests it's recommended to iterate through it via ``iter()`` method (all params and available filters are same for both methods). Usage: diff --git a/scrapinghub/client/samples.py b/scrapinghub/client/samples.py index 581d0fd7..268bbbe7 100644 --- a/scrapinghub/client/samples.py +++ b/scrapinghub/client/samples.py @@ -6,11 +6,12 @@ class Samples(_Proxy): """Representation of collection of job samples. - Not a public constructor: use :class:`Job` instance to get a - :class:`Samples` instance. See :attr:`Job.samples` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance + to get a :class:`Samples` instance. + See :attr:`~scrapinghub.client.jobs.Job.samples` attribute. - Please note that list() method can use a lot of memory and for a large - amount of samples it's recommended to iterate through it via iter() + Please note that ``list()`` method can use a lot of memory and for a large + amount of samples it's recommended to iterate through it via ``iter()`` method (all params and available filters are same for both methods). Usage: diff --git a/scrapinghub/client/spiders.py b/scrapinghub/client/spiders.py index b70aaca4..1d8e4e81 100644 --- a/scrapinghub/client/spiders.py +++ b/scrapinghub/client/spiders.py @@ -11,8 +11,9 @@ class Spiders(object): """Class to work with a collection of project spiders. - Not a public constructor: use :class:`Project` instance to get - a :class:`Spiders` instance. See :attr:`Project.spiders` attribute. + Not a public constructor: use :class:`~scrapinghub.client.projects.Project` + instance to get a :class:`Spiders` instance. + See :attr:`~scrapinghub.client.projects.Project.spiders` attribute. :ivar project_id: string project id. @@ -32,7 +33,7 @@ def get(self, spider, **params): The method gets/sets spider id (and checks if spider exists). :param spider: a string spider name. - :return: :class:`Spider` object. + :return: a spider object. :rtype: :class:`scrapinghub.client.spiders.Spider` Usage:: @@ -84,7 +85,7 @@ class Spider(object): :ivar project_id: a string project id. :ivar key: a string key in format 'project_id/spider_id'. :ivar name: a spider name string. - :ivar jobs: a collection of jobs, :class:`Jobs` object. + :ivar jobs: a collection of jobs, :class:`~scrapinghub.client.jobs.Jobs` object. 
Usage:: diff --git a/scrapinghub/client/utils.py b/scrapinghub/client/utils.py index 3cac5f44..a079befc 100644 --- a/scrapinghub/client/utils.py +++ b/scrapinghub/client/utils.py @@ -90,12 +90,13 @@ class _Proxy(object): The internal proxy class is useful to link class attributes from its origin depending on the origin base class as a part of init logic: - - :class:`ItemsResourceType` provides items-based attributes to access - items in an arbitrary collection with get/write/flush/close/stats/ - iter methods. + - :class:`~scrapinghub.hubstorage.resourcetype.ItemsResourceType` provides + items-based attributes to access items in an arbitrary collection with + get/write/flush/close/stats/iter methods. - - :class:`DownloadableResource` provides download-based attributes to - iter through collection with or without msgpack support. + - :class:`~scrapinghub.hubstorage.resourcetype.DownloadableResource` provides + download-based attributes to iter through collection with or without + msgpack support. """ def __init__(self, cls, client, key): @@ -144,9 +145,10 @@ def _modify_iter_params(self, params): def list(self, *args, **kwargs): """Convenient shortcut to list iter results. - Please note that list() method can use a lot of memory and for a large - amount of elements it's recommended to iterate through it via iter() - method (all params and available filters are same for both methods). + Please note that ``list()`` method can use a lot of memory and for a + large amount of elements it's recommended to iterate through it via + ``iter()`` method (all params and available filters are same for both + methods). """ return list(self.iter(*args, **kwargs)) From 7a505c772a341c4e3e8d61be1f919579b210a292 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 13:49:23 +0300 Subject: [PATCH 23/40] Provide links to API in Overview --- docs/client/overview.rst | 78 ++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/docs/client/overview.rst b/docs/client/overview.rst index c967b046..2b32314f 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -1,9 +1,11 @@ Overview ======== -The ``scrapinghub.ScrapinghubClient`` is a new Python client for communicating -with the `Scrapinghub API`_. It takes best from ``scrapinghub.Connection`` and -``scrapinghub.HubstorageClient`` and combines it under single interface. +The :class:`~scrapinghub.client.ScrapinghubClient` is a new Python client for +communicating with the `Scrapinghub API`_. +It takes best from :class:`~scrapinghub.legacy.Connection` and +:class:`~scrapinghub.hubstorage.HubstorageClient`, and combines it under single +interface. First, you instantiate new client:: @@ -12,7 +14,8 @@ First, you instantiate new client:: >>> client -Client instance has ``projects`` field for access to client projects. +Client instance has :attr:`~scrapinghub.client.ScrapinghubClient.projects` field +for access to client projects. Projects -------- @@ -46,32 +49,29 @@ And select a particular project to work with:: .. tip:: The above is a shortcut for ``client.projects.get(123)``. + Project ------- -Project instance has ``jobs`` field to work with the project jobs. +:class:`~scrapinghub.client.projects.Project` instance has +:attr:`~scrapinghub.client.projects.Project.jobs` field to work with +the project jobs. -Jobs instance is described well in :ref:`Jobs ` section below. +:class:`~scrapinghub.client.jobs.Jobs` instance is described well in +:ref:`Jobs ` section below. 
-For example, to schedule a spider run (it returns a job object):: +For example, to schedule a spider run (it returns a +:class:`~scrapinghub.client.jobs.Job` object):: >>> project.jobs.run('spider1', job_args={'arg1': 'val1'}) > -Project instance also has the following fields: - -- **activity** - access to :ref:`project activity ` records -- **collections** - work with :ref:`project collections ` -- **frontiers** - using :ref:`project frontiers ` -- **settings** - interface to :ref:`project settings ` -- **spiders** - access to :ref:`spiders collection ` - - -.. _project-settings: Settings -------- +You can work with project settings via :class:`~scrapinghub.client.projects.Settings`. + To get a list of the project settings:: >>> project.settings.list() @@ -91,11 +91,12 @@ Or update a few project settings at once:: >>> project.settings.update({'default_job_units': 1, ... 'job_runtime_limit': 20}) -.. _project-spiders: Spiders ------- +Spiders collection is accessible via :class:`~scrapinghub.client.spiders.Spiders`. + To get the list of spiders of the project:: >>> project.spiders.list() @@ -119,7 +120,8 @@ To select a particular spider to work with:: Spider ------ -Like project instance, spider instance has ``jobs`` field to work with the spider's jobs. +Like project instance, :class:`~scrapinghub.client.spiders.Spider` instance has +``jobs`` field to work with the spider's jobs. To schedule a spider run:: @@ -133,7 +135,8 @@ Note that you don't need to specify spider name explicitly. Jobs ---- -Jobs collection is available on project/spider level. +:class:`~scrapinghub.client.jobs.Jobs` collection is available on project/spider +level. get ^^^ @@ -290,7 +293,8 @@ Note that there can be a lot of spiders, so the method above returns an iterator Job --- -Job instance provides access to a job data with the following fields: +:class:`~scrapinghub.client.jobs.Job` instance provides access to a job data +with the following fields: - metadata - items @@ -311,7 +315,8 @@ To delete a job:: Metadata ^^^^^^^^ -Job details can be found in jobs metadata and it's scrapystats:: +:class:`~scrapinghub.client.jobs.JobMeta` details can be found in jobs metadata +and it's scrapystats:: >>> job.metadata.get('version') '5123a86-master' @@ -338,7 +343,8 @@ Anything can be stored in metadata, here is example how to add tags:: Items ^^^^^ -To retrieve all scraped items from a job:: +To retrieve all scraped items from a job use +:class:`~scrapinghub.client.items.Items`:: >>> for item in job.items.iter(): ... # do something with item (it's just a dict) @@ -348,7 +354,7 @@ To retrieve all scraped items from a job:: Logs ^^^^ -To retrieve all log entries from a job:: +To retrieve all log entries from a job use :class:`~scrapinghub.client.logs.Logs`:: >>> for logitem in job.logs.iter(): ... # logitem is a dict with level, message, time @@ -364,7 +370,7 @@ To retrieve all log entries from a job:: Requests ^^^^^^^^ -To retrieve all requests from a job:: +To retrieve all requests from a job there's :class:`~scrapinghub.client.requests.Requests`:: >>> for reqitem in job.requests.iter(): ... # reqitem is a dict @@ -384,18 +390,21 @@ To retrieve all requests from a job:: Samples ^^^^^^^ -To retrieve all samples for a job:: +:class:`~scrapinghub.client.samples.Samples` is useful to retrieve all samples +for a job:: >>> for sample in job.samples.iter(): ... # sample is a list with a timestamp and data >>> sample [1482233732452, 0, 0, 0, 0, 0] -.. 
_project-activity: Activity -------- +:class:`~scrapinghub.client.activity.Activity` provides a convenient interface +to project activity events. + To retrieve all activity events from a project:: >>> project.activity.iter() @@ -419,14 +428,13 @@ Or post multiple events at once:: >>> project.activity.add(events) -.. _project-collections: Collections ----------- As an example, let's store hash and timestamp pair for foo spider. -Usual workflow with `Collections`_ would be:: +Usual workflow with :class:`~scrapinghub.client.collections.Collections` would be:: >>> collections = project.collections >>> foo_store = collections.get_store('foo_store') @@ -447,12 +455,11 @@ Usual workflow with `Collections`_ would be:: Collections are available on project level only. -.. _project-frontiers: Frontiers --------- -Typical workflow with `Frontier`_:: +Typical workflow with :class:`~scrapinghub.client.frontiers.Frontiers`:: >>> frontiers = project.frontiers @@ -466,7 +473,7 @@ List all frontiers:: >>> frontiers.list() ['test', 'test1', 'test2'] -Get a frontier by name:: +Get a :class:`~scrapinghub.client.frontiers.Frontier` instance by name:: >>> frontier = frontiers.get('test') >>> frontier @@ -482,7 +489,7 @@ List all slots:: >>> frontier.list() ['example.com', 'example.com2'] -Get a frontier slot by name:: +Get a :class:`~scrapinghub.client.frontiers.FrontierSlot` by name:: >>> slot = frontier.get('example.com') >>> slot @@ -507,7 +514,9 @@ Add a fingerprint only to the slot:: >>> slot.fingerprints.add(['fp1', 'fp2']) >>> slot.flush() -There are convenient shortcuts: ``f`` for ``fingerprints`` and ``q`` for ``queue``. +There are convenient shortcuts: ``f`` for ``fingerprints`` to access +:class:`~scrapinghub.client.frontiers.FrontierSlotFingerprints` and ``q`` for +``queue`` to access :class:`~scrapinghub.client.frontiers.FrontierSlotQueue`. Add requests with additional parameters:: @@ -583,7 +592,6 @@ Exceptions .. _Scrapinghub API: http://doc.scrapinghub.com/api.html -.. _Collections: http://doc.scrapinghub.com/api/collections.html .. _Frontier: http://doc.scrapinghub.com/api/frontier.html .. _count endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-count .. _list endpoint: https://doc.scrapinghub.com/api/jobq.html#jobq-project-id-list From 94906f9c59b983300a3fa4abd5ef32716dcd66f1 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 14:55:30 +0300 Subject: [PATCH 24/40] Fix links for list/iter warning --- scrapinghub/client/activity.py | 4 ++-- scrapinghub/client/collections.py | 8 ++++---- scrapinghub/client/items.py | 7 ++++--- scrapinghub/client/jobs.py | 4 ++-- scrapinghub/client/logs.py | 6 +++--- scrapinghub/client/requests.py | 7 ++++--- scrapinghub/client/samples.py | 7 ++++--- scrapinghub/client/utils.py | 4 ++-- 8 files changed, 25 insertions(+), 22 deletions(-) diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 6367f389..bb492fed 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -11,8 +11,8 @@ class Activity(_Proxy): instance to get a :class:`~scrapinghub.client.activity.Activity` instance. See :attr:`~scrapinghub.client.projects.Project.activity` attribute. 
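A short usage sketch, assuming ``project`` is an already fetched project instance
and the event payload fields are illustrative placeholders::

    >>> for event in project.activity.iter():
    ...     pass  # each event is a dict
    >>> events = [{'event': 'job:completed', 'job': '123/3/4', 'user': 'jobrunner'}]
    >>> project.activity.add(events)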
- Please note that ``list()`` method can use a lot of memory and for a large - amount of activities it's recommended to iterate through it via ``iter()`` + Please note that :meth:`list` method can use a lot of memory and for a large + amount of activities it's recommended to iterate through it via :meth:`iter` method (all params and available filters are same for both methods). Usage: diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index a7a0fb87..c135bd39 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -160,10 +160,10 @@ def list(self, key=None, prefix=None, prefixcount=None, startts=None, endts=None, requests_params=None, **params): """Convenient shortcut to list iter results. - Please note that ``list()`` method can use a lot of memory and for a - large amount of elements it's recommended to iterate through it via - ``iter()`` method (all params and available filters are same for both - methods). + Please note that :meth:`list` method can use a lot of memory and for a + large amount of logs it's recommended to iterate through it + via :meth:`iter` method (all params and available filters are same for + both methods). :param key: a string key or a list of keys to filter with. :param prefix: a string prefix to filter items. diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 49c3e49c..45329e17 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -10,9 +10,10 @@ class Items(_Proxy): e to get a :class:`Items` instance. See :attr:`~scrapinghub.client.jobs.Job.items` attribute. - Please note that ``list()`` method can use a lot of memory and for a large - amount of items it's recommended to iterate through it via ``iter()`` method - (all params and available filters are same for both methods). + Please note that :meth:`list` method can use a lot of memory and for + a large amount of logs it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for + both methods). Usage: diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index fa13fa17..b72feed2 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -167,8 +167,8 @@ def list(self, count=None, start=None, spider=None, state=None, :return: list of dictionaries of jobs summary for a given filter params. :rtype: :class:`list[dict]` - Please note that list() method can use a lot of memory and for a large - amount of jobs it's recommended to iterate through it via iter() + Please note that :meth:`list` can use a lot of memory and for a large + amount of logs it's recommended to iterate through it via :meth:`iter` method (all params and available filters are same for both methods). """ # FIXME we double-check the params here, is there a better way? diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 15a70038..6771607c 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -12,9 +12,9 @@ class Logs(_Proxy): to get a :class:`Logs` instance. See :attr:`~scrapinghub.client.jobs.Job.logs` attribute. - Please note that ``list()`` method can use a lot of memory and for a large - amount of logs it's recommended to iterate through it via ``iter()`` method - (all params and available filters are same for both methods). 
+ Please note that :meth:`list` method can use a lot of memory and for a + large amount of logs it's recommended to iterate through it via :meth:`iter` + method (all params and available filters are same for both methods). Usage: diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index 82777fdb..e07cd36e 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -10,9 +10,10 @@ class Requests(_Proxy): to get a :class:`Requests` instance. See :attr:`~scrapinghub.client.jobs.Job.requests` attribute. - Please note that ``list()`` method can use a lot of memory and for a large - amount of requests it's recommended to iterate through it via ``iter()`` - method (all params and available filters are same for both methods). + Please note that :meth:`list` method can use a lot of memory and for + a large amount of logs it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for + both methods). Usage: diff --git a/scrapinghub/client/samples.py b/scrapinghub/client/samples.py index 268bbbe7..828677d2 100644 --- a/scrapinghub/client/samples.py +++ b/scrapinghub/client/samples.py @@ -10,9 +10,10 @@ class Samples(_Proxy): to get a :class:`Samples` instance. See :attr:`~scrapinghub.client.jobs.Job.samples` attribute. - Please note that ``list()`` method can use a lot of memory and for a large - amount of samples it's recommended to iterate through it via ``iter()`` - method (all params and available filters are same for both methods). + Please note that :meth:`list` method can use a lot of memory and for + a large amount of logs it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for + both methods). Usage: diff --git a/scrapinghub/client/utils.py b/scrapinghub/client/utils.py index a079befc..405eafde 100644 --- a/scrapinghub/client/utils.py +++ b/scrapinghub/client/utils.py @@ -145,9 +145,9 @@ def _modify_iter_params(self, params): def list(self, *args, **kwargs): """Convenient shortcut to list iter results. - Please note that ``list()`` method can use a lot of memory and for a + Please note that :meth:`list` method can use a lot of memory and for a large amount of elements it's recommended to iterate through it via - ``iter()`` method (all params and available filters are same for both + :meth:`iter` method (all params and available filters are same for both methods). 
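To make the recurring list-versus-iter advice concrete, a minimal sketch of
both access patterns (assuming ``client`` is an already-constructed client,
``job`` is readable, and ``process`` is a placeholder for user code)::

    >>> job = client.get_job('123/1/2')
    >>> items = job.items.list()        # materializes every element in memory
    >>> for item in job.items.iter():   # streams elements with bounded memory
    ...     process(item)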
""" return list(self.iter(*args, **kwargs)) From f7cafbb5cc2e9d8ee1e21326dc137c262ad8db55 Mon Sep 17 00:00:00 2001 From: Alexander Chekunkov Date: Tue, 28 Mar 2017 13:23:49 +0100 Subject: [PATCH 25/40] Disable cross-referencing for ivars http://stackoverflow.com/a/41184353/1932023 --- docs/conf.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 30c7e1d8..c1a37395 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,6 +20,10 @@ import sys from datetime import datetime +from docutils import nodes +from sphinx.util.docfields import TypedField +from sphinx import addnodes + sys.path.insert(0, os.path.abspath('..')) @@ -176,3 +180,43 @@ html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # otherwise, readthedocs.org uses their theme by default, no need to specify it + +# disable cross-reference for ivar +# patch taken from http://stackoverflow.com/a/41184353/1932023 +def patched_make_field(self, types, domain, items): + # type: (List, unicode, Tuple) -> nodes.field + def handle_item(fieldarg, content): + par = nodes.paragraph() + par += addnodes.literal_strong('', fieldarg) # Patch: this line added + # par.extend(self.make_xrefs(self.rolename, domain, fieldarg, + # addnodes.literal_strong)) + if fieldarg in types: + par += nodes.Text(' (') + # NOTE: using .pop() here to prevent a single type node to be + # inserted twice into the doctree, which leads to + # inconsistencies later when references are resolved + fieldtype = types.pop(fieldarg) + if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text): + typename = u''.join(n.astext() for n in fieldtype) + par.extend(self.make_xrefs(self.typerolename, domain, typename, + addnodes.literal_emphasis)) + else: + par += fieldtype + par += nodes.Text(')') + par += nodes.Text(' -- ') + par += content + return par + + fieldname = nodes.field_name('', self.label) + if len(items) == 1 and self.can_collapse: + fieldarg, content = items[0] + bodynode = handle_item(fieldarg, content) + else: + bodynode = self.list_type() + for fieldarg, content in items: + bodynode += nodes.list_item('', handle_item(fieldarg, content)) + fieldbody = nodes.field_body('', bodynode) + return nodes.field('', fieldname, fieldbody) + + +TypedField.make_field = patched_make_field From 8764f745ad93bce67315c405bcab98eef191333a Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 15:16:55 +0300 Subject: [PATCH 26/40] Move proxy logic into a separate module --- scrapinghub/client/activity.py | 2 +- scrapinghub/client/collections.py | 5 +- scrapinghub/client/frontiers.py | 3 +- scrapinghub/client/items.py | 2 +- scrapinghub/client/jobs.py | 5 +- scrapinghub/client/logs.py | 2 +- scrapinghub/client/projects.py | 3 +- scrapinghub/client/proxy.py | 178 ++++++++++++++++++++++++++++++ scrapinghub/client/requests.py | 2 +- scrapinghub/client/samples.py | 2 +- scrapinghub/client/utils.py | 174 ----------------------------- tests/client/test_utils.py | 2 +- 12 files changed, 192 insertions(+), 188 deletions(-) create mode 100644 scrapinghub/client/proxy.py diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index bb492fed..b888a6e9 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _Proxy from .utils import parse_job_key diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index c135bd39..868fdc5d 
100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -5,9 +5,8 @@ from ..hubstorage.collectionsrt import Collection as _Collection -from .utils import ( - _Proxy, format_iter_filters, proxy_methods, wrap_kwargs, update_kwargs, -) +from .proxy import _Proxy, proxy_methods, wrap_kwargs, format_iter_filters +from .utils import update_kwargs class Collections(_Proxy): diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index 052774ea..bf2ae721 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -7,7 +7,8 @@ from ..hubstorage.frontier import Frontier as _Frontier from ..hubstorage.utils import urlpathjoin -from .utils import _Proxy, update_kwargs +from .proxy import _Proxy +from .utils import update_kwargs class _HSFrontier(_Frontier): diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 45329e17..56069344 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _Proxy class Items(_Proxy): diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index b72feed2..a022e85e 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -11,9 +11,8 @@ from .requests import Requests from .samples import Samples from .exceptions import NotFound, BadRequest, DuplicateJobError -from .utils import ( - _MappingProxy, get_tags_for_update, parse_job_key, update_kwargs, -) +from .proxy import _MappingProxy +from .utils import get_tags_for_update, parse_job_key, update_kwargs class Jobs(object): diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 6771607c..61efecce 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -1,7 +1,7 @@ from __future__ import absolute_import import json -from .utils import _Proxy +from .proxy import _Proxy from .utils import LogLevel diff --git a/scrapinghub/client/projects.py b/scrapinghub/client/projects.py index cbb3e33f..35f93eae 100644 --- a/scrapinghub/client/projects.py +++ b/scrapinghub/client/projects.py @@ -8,8 +8,9 @@ from .collections import Collections from .frontiers import _HSFrontier, Frontiers from .jobs import Jobs +from .proxy import _MappingProxy from .spiders import Spiders -from .utils import _MappingProxy, parse_project_id +from .utils import parse_project_id class Projects(object): diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py new file mode 100644 index 00000000..e0e6a8ce --- /dev/null +++ b/scrapinghub/client/proxy.py @@ -0,0 +1,178 @@ +from __future__ import absolute_import + +import six +import json + +from ..hubstorage.resourcetype import DownloadableResource +from ..hubstorage.resourcetype import ItemsResourceType +from ..hubstorage.collectionsrt import Collections + +from .exceptions import wrap_value_too_large + + +class _Proxy(object): + """A helper to create a class instance and proxy its methods to origin. + + The internal proxy class is useful to link class attributes from its + origin depending on the origin base class as a part of init logic: + + - :class:`~scrapinghub.hubstorage.resourcetype.ItemsResourceType` provides + items-based attributes to access items in an arbitrary collection with + get/write/flush/close/stats/iter methods. + + - :class:`~scrapinghub.hubstorage.resourcetype.DownloadableResource` provides + download-based attributes to iter through collection with or without + msgpack support. 
+ """ + + def __init__(self, cls, client, key): + self.key = key + self._client = client + self._origin = cls(client._hsclient, key) + + if issubclass(cls, ItemsResourceType): + self._proxy_methods(['get', 'write', 'flush', 'close', + 'stats', ('iter', 'list')]) + # redefine write method to wrap hubstorage.ValueTooLarge error + origin_method = getattr(self, 'write') + setattr(self, 'write', wrap_value_too_large(origin_method)) + + # DType iter_values() has more priority than IType list() + # plus Collections interface doesn't need the iter methods + if issubclass(cls, DownloadableResource) and cls is not Collections: + methods = [('iter', 'iter_values'), + ('iter_raw_msgpack', 'iter_msgpack'), + ('iter_raw_json', 'iter_json')] + self._proxy_methods(methods) + self._wrap_iter_methods([method[0] for method in methods]) + + def _proxy_methods(self, methods): + """A little helper for cleaner interface.""" + proxy_methods(self._origin, self, methods) + + def _wrap_iter_methods(self, methods): + """Modify kwargs for all passed self.iter* methods.""" + for method in methods: + wrapped = wrap_kwargs(getattr(self, method), + self._modify_iter_params) + setattr(self, method, wrapped) + + def _modify_iter_params(self, params): + """A helper to modify iter() params on-the-fly. + + The method is internal and should be redefined in subclasses. + + :param params: a dictionary with input parameters. + :return: an updated dictionary with parameters. + :rtype: :class:`dict` + """ + return format_iter_filters(params) + + def list(self, *args, **kwargs): + """Convenient shortcut to list iter results. + + Please note that :meth:`list` method can use a lot of memory and for a + large amount of elements it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for both + methods). + """ + return list(self.iter(*args, **kwargs)) + + +class _MappingProxy(_Proxy): + """A helper class to support basic get/set interface for dict-like + collections of elements. + """ + + def get(self, key): + """Get element value by key. + + :param key: a string key + """ + return next(self._origin.apiget(key)) + + def set(self, key, value): + """Set element value. + + :param key: a string key + :param value: new value to set for the key + """ + self._origin.apipost(key, data=json.dumps(value), is_idempotent=True) + + def update(self, values): + """Update multiple elements at once. + + The method provides convenient interface for partial updates. + + :param values: a dictionary with key/values to update. + """ + if not isinstance(values, dict): + raise TypeError("values should be a dict") + data = next(self._origin.apiget()) + data.update(values) + self._origin.apipost(jl={k: v for k, v in six.iteritems(data) + if k not in self._origin.ignore_fields}, + is_idempotent=True) + + def delete(self, key): + """Delete element by key. + + :param key: a string key + """ + self._origin.apidelete(key) + + def iter(self): + """Iterate through key/value pairs. + + :return: an iterator over key/value pairs. + :rtype: :class:`collections.Iterable` + """ + return six.iteritems(next(self._origin.apiget())) + + +def proxy_methods(origin, successor, methods): + """A helper to proxy methods from origin to successor. 
+ + Accepts a list with strings and tuples: + + - each string defines: + a successor method name to proxy 1:1 with origin method + - each tuple should consist of 2 strings: + a successor method name and an origin method name + """ + for method in methods: + if isinstance(method, tuple): + successor_name, origin_name = method + else: + successor_name, origin_name = method, method + if not hasattr(successor, successor_name): + setattr(successor, successor_name, getattr(origin, origin_name)) + + +def format_iter_filters(params): + """Format iter() filter param on-the-fly. + + Support passing multiple filters at once as a list with tuples. + """ + filters = params.get('filter') + if filters and isinstance(filters, list): + filter_data = [] + for elem in params.pop('filter'): + if isinstance(elem, six.string_types): + filter_data.append(elem) + elif isinstance(elem, (list, tuple)): + filter_data.append(json.dumps(elem)) + else: + raise ValueError( + "Filter condition must be string, tuple or list") + if filter_data: + params['filter'] = filter_data + return params + + +def wrap_kwargs(fn, kwargs_fn): + """Tiny wrapper to prepare modified version of function kwargs""" + def wrapped(*args, **kwargs): + kwargs = kwargs_fn(kwargs) + return fn(*args, **kwargs) + return wrapped diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index e07cd36e..2f9a217c 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _Proxy class Requests(_Proxy): diff --git a/scrapinghub/client/samples.py b/scrapinghub/client/samples.py index 828677d2..e966abfe 100644 --- a/scrapinghub/client/samples.py +++ b/scrapinghub/client/samples.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .utils import _Proxy +from .proxy import _Proxy class Samples(_Proxy): diff --git a/scrapinghub/client/utils.py b/scrapinghub/client/utils.py index 405eafde..78a51292 100644 --- a/scrapinghub/client/utils.py +++ b/scrapinghub/client/utils.py @@ -8,12 +8,6 @@ import six -from ..hubstorage.resourcetype import DownloadableResource -from ..hubstorage.resourcetype import ItemsResourceType -from ..hubstorage.collectionsrt import Collections - -from .exceptions import wrap_value_too_large - class LogLevel(object): DEBUG = logging.DEBUG @@ -84,174 +78,6 @@ def get_tags_for_update(**kwargs): return params -class _Proxy(object): - """A helper to create a class instance and proxy its methods to origin. - - The internal proxy class is useful to link class attributes from its - origin depending on the origin base class as a part of init logic: - - - :class:`~scrapinghub.hubstorage.resourcetype.ItemsResourceType` provides - items-based attributes to access items in an arbitrary collection with - get/write/flush/close/stats/iter methods. - - - :class:`~scrapinghub.hubstorage.resourcetype.DownloadableResource` provides - download-based attributes to iter through collection with or without - msgpack support. 
- """ - - def __init__(self, cls, client, key): - self.key = key - self._client = client - self._origin = cls(client._hsclient, key) - - if issubclass(cls, ItemsResourceType): - self._proxy_methods(['get', 'write', 'flush', 'close', - 'stats', ('iter', 'list')]) - # redefine write method to wrap hubstorage.ValueTooLarge error - origin_method = getattr(self, 'write') - setattr(self, 'write', wrap_value_too_large(origin_method)) - - # DType iter_values() has more priority than IType list() - # plus Collections interface doesn't need the iter methods - if issubclass(cls, DownloadableResource) and cls is not Collections: - methods = [('iter', 'iter_values'), - ('iter_raw_msgpack', 'iter_msgpack'), - ('iter_raw_json', 'iter_json')] - self._proxy_methods(methods) - self._wrap_iter_methods([method[0] for method in methods]) - - def _proxy_methods(self, methods): - """A little helper for cleaner interface.""" - proxy_methods(self._origin, self, methods) - - def _wrap_iter_methods(self, methods): - """Modify kwargs for all passed self.iter* methods.""" - for method in methods: - wrapped = wrap_kwargs(getattr(self, method), - self._modify_iter_params) - setattr(self, method, wrapped) - - def _modify_iter_params(self, params): - """A helper to modify iter() params on-the-fly. - - The method is internal and should be redefined in subclasses. - - :param params: a dictionary with input parameters. - :return: an updated dictionary with parameters. - :rtype: :class:`dict` - """ - return format_iter_filters(params) - - def list(self, *args, **kwargs): - """Convenient shortcut to list iter results. - - Please note that :meth:`list` method can use a lot of memory and for a - large amount of elements it's recommended to iterate through it via - :meth:`iter` method (all params and available filters are same for both - methods). - """ - return list(self.iter(*args, **kwargs)) - - -class _MappingProxy(_Proxy): - """A helper class to support basic get/set interface for dict-like - collections of elements. - """ - - def get(self, key): - """Get element value by key. - - :param key: a string key - """ - return next(self._origin.apiget(key)) - - def set(self, key, value): - """Set element value. - - :param key: a string key - :param value: new value to set for the key - """ - self._origin.apipost(key, data=json.dumps(value), is_idempotent=True) - - def update(self, values): - """Update multiple elements at once. - - The method provides convenient interface for partial updates. - - :param values: a dictionary with key/values to update. - """ - if not isinstance(values, dict): - raise TypeError("values should be a dict") - data = next(self._origin.apiget()) - data.update(values) - self._origin.apipost(jl={k: v for k, v in six.iteritems(data) - if k not in self._origin.ignore_fields}, - is_idempotent=True) - - def delete(self, key): - """Delete element by key. - - :param key: a string key - """ - self._origin.apidelete(key) - - def iter(self): - """Iterate through key/value pairs. - - :return: an iterator over key/value pairs. - :rtype: :class:`collections.Iterable` - """ - return six.iteritems(next(self._origin.apiget())) - - -def wrap_kwargs(fn, kwargs_fn): - """Tiny wrapper to prepare modified version of function kwargs""" - def wrapped(*args, **kwargs): - kwargs = kwargs_fn(kwargs) - return fn(*args, **kwargs) - return wrapped - - -def proxy_methods(origin, successor, methods): - """A helper to proxy methods from origin to successor. 
- - Accepts a list with strings and tuples: - - - each string defines: - a successor method name to proxy 1:1 with origin method - - each tuple should consist of 2 strings: - a successor method name and an origin method name - """ - for method in methods: - if isinstance(method, tuple): - successor_name, origin_name = method - else: - successor_name, origin_name = method, method - if not hasattr(successor, successor_name): - setattr(successor, successor_name, getattr(origin, origin_name)) - - -def format_iter_filters(params): - """Format iter() filter param on-the-fly. - - Support passing multiple filters at once as a list with tuples. - """ - filters = params.get('filter') - if filters and isinstance(filters, list): - filter_data = [] - for elem in params.pop('filter'): - if isinstance(elem, six.string_types): - filter_data.append(elem) - elif isinstance(elem, (list, tuple)): - filter_data.append(json.dumps(elem)) - else: - raise ValueError( - "Filter condition must be string, tuple or list") - if filter_data: - params['filter'] = filter_data - return params - - def update_kwargs(kwargs, **params): """Update kwargs dict with non-empty params with json-encoded values.""" kwargs.update({k: json.dumps(v) if isinstance(v, dict) else v diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 03e4362c..57787ccd 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -5,7 +5,7 @@ import mock from scrapinghub.client.utils import parse_auth -from scrapinghub.client.utils import format_iter_filters +from scrapinghub.client.proxy import format_iter_filters def test_format_iter_filters(): From cc0453d7f4355ebdf83ccfbd9981e2dfbd7bd81d Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 17:30:49 +0300 Subject: [PATCH 27/40] Convert proxy logic to subclasses system --- scrapinghub/client/activity.py | 8 ++- scrapinghub/client/collections.py | 90 ++++++++++++++---------- scrapinghub/client/frontiers.py | 7 +- scrapinghub/client/items.py | 4 +- scrapinghub/client/logs.py | 27 +++++-- scrapinghub/client/proxy.py | 112 ++++++++++++------------------ scrapinghub/client/requests.py | 10 +-- scrapinghub/client/samples.py | 4 +- 8 files changed, 139 insertions(+), 123 deletions(-) diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index b888a6e9..5cc6c3ee 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .proxy import _Proxy +from .proxy import _Proxy, format_iter_filters from .utils import parse_job_key @@ -46,8 +46,10 @@ class Activity(_Proxy): """ def __init__(self, *args, **kwargs): super(Activity, self).__init__(*args, **kwargs) - self._proxy_methods([('iter', 'list')]) - self._wrap_iter_methods(['iter']) + + def iter(self, **params): + params = format_iter_filters(params) + return self._origin.list(**params) def add(self, values, **kwargs): """Add new event to the project activity. 
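With ``iter()`` now defined explicitly on ``Activity``, a typical call might
look like the following (the event payload shown is illustrative)::

    >>> events = project.activity.iter(count=100)
    >>> next(events)
    {'event': 'job:completed', 'job': '123/2/3', 'user': 'jobrunner'}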
diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index 868fdc5d..521cc008 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -5,11 +5,11 @@ from ..hubstorage.collectionsrt import Collection as _Collection -from .proxy import _Proxy, proxy_methods, wrap_kwargs, format_iter_filters +from .proxy import _Proxy, _DownloadableProxyMixin, format_iter_filters from .utils import update_kwargs -class Collections(_Proxy): +class Collections(_Proxy, _DownloadableProxyMixin): """Access to project collections. Not a public constructor: use :class:`~scrapinghub.client.projects.Project` @@ -144,42 +144,6 @@ class Collection(object): def __init__(self, client, collections, type_, name): self._client = client self._origin = _Collection(type_, name, collections._origin) - proxy_methods(self._origin, self, [ - 'create_writer', 'count', - ('iter', 'iter_values'), - ('iter_raw_json', 'iter_json'), - ]) - # simplified version of _Proxy._wrap_iter_methods logic - # to provide better support for filter param in iter methods - for method in ['iter', 'iter_raw_json']: - wrapped = wrap_kwargs(getattr(self, method), format_iter_filters) - setattr(self, method, wrapped) - - def list(self, key=None, prefix=None, prefixcount=None, startts=None, - endts=None, requests_params=None, **params): - """Convenient shortcut to list iter results. - - Please note that :meth:`list` method can use a lot of memory and for a - large amount of logs it's recommended to iterate through it - via :meth:`iter` method (all params and available filters are same for - both methods). - - :param key: a string key or a list of keys to filter with. - :param prefix: a string prefix to filter items. - :param prefixcount: maximum number of values to return per prefix. - :param startts: UNIX timestamp at which to begin results. - :param endts: UNIX timestamp at which to end results. - :param requests_params: (optional) a dict with optional requests params. - :param \*\*params: (optional) additional query params for the request. - :return: a list of items where each item is represented with a dict. - :rtype: :class:`list[dict]` - """ - # FIXME there should be similar docstrings for iter/iter_raw_json - # but as we proxy them as-is, it's not in place, should be improved - update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, - startts=startts, endts=endts, - requests_params=requests_params) - return list(self.iter(requests_params=None, **params)) def get(self, key, **params): """Get item from collection by key. 
@@ -215,6 +179,28 @@ def delete(self, keys): "object providing string keys") self._origin.delete(keys) + def count(self, *args, **kwargs): + return self._origin._collections.count( + self._origin.coltype, self._origin.colname, *args, **kwargs) + + def iter(self, key=None, prefix=None, prefixcount=None, startts=None, + endts=None, requests_params=None, **params): + update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, + startts=startts, endts=endts, + requests_params=requests_params) + params = format_iter_filters(params) + return self._origin._collections.iter_values( + self._origin.coltype, self._origin.colname, **params) + + def iter_raw_json(self, key=None, prefix=None, prefixcount=None, + startts=None, endts=None, requests_params=None, **params): + update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, + startts=startts, endts=endts, + requests_params=requests_params) + params = format_iter_filters(params) + return self._origin._collections.iter_json( + self._origin.coltype, self._origin.colname, **params) + def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, startts=None, endts=None, requests_params=None, **params): @@ -234,5 +220,33 @@ def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) + params = format_iter_filters(params) return self._origin._collections.iter_msgpack( self._origin.coltype, self._origin.colname, **params) + + def list(self, key=None, prefix=None, prefixcount=None, startts=None, + endts=None, requests_params=None, **params): + """Convenient shortcut to list iter results. + + Please note that :meth:`list` method can use a lot of memory and for a + large amount of logs it's recommended to iterate through it + via :meth:`iter` method (all params and available filters are same for + both methods). + + :param key: a string key or a list of keys to filter with. + :param prefix: a string prefix to filter items. + :param prefixcount: maximum number of values to return per prefix. + :param startts: UNIX timestamp at which to begin results. + :param endts: UNIX timestamp at which to end results. + :param requests_params: (optional) a dict with optional requests params. + :param \*\*params: (optional) additional query params for the request. + :return: a list of items where each item is represented with a dict. + :rtype: :class:`list[dict]` + """ + update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, + startts=startts, endts=endts) + return list(self.iter(requests_params=requests_params, **params)) + + def create_writer(self, **kwargs): + return self._origin._collections.create_writer( + self._origin.coltype, self._origin.colname, **kwargs) diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index bf2ae721..e9d4db96 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -85,7 +85,6 @@ class Frontiers(_Proxy): """ def __init__(self, *args, **kwargs): super(Frontiers, self).__init__(*args, **kwargs) - self._proxy_methods(['close', 'flush']) def get(self, name): """Get a frontier by name. @@ -121,6 +120,12 @@ def newcount(self): """ return sum(self._origin.newcount.values()) + def flush(self): + self._origin.flush() + + def close(self): + self._origin.close() + class Frontier(object): """Representation of a frontier object. 
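The explicit ``Collection`` methods above pass the usual filtering parameters
straight through to the storage API; an illustrative sketch (store name,
prefix and the printed fields are made up)::

    >>> foo_store = project.collections.get_store('foo_store')
    >>> foo_store.count()
    42
    >>> for item in foo_store.iter(prefix='002113', prefixcount=10):
    ...     print(item['_key'], item['value'])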
diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 56069344..45d0e7d1 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -1,9 +1,9 @@ from __future__ import absolute_import -from .proxy import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin -class Items(_Proxy): +class Items(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job items. Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instanc diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 61efecce..8b53356a 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -1,11 +1,13 @@ from __future__ import absolute_import + import json +import logging -from .proxy import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin from .utils import LogLevel -class Logs(_Proxy): +class Logs(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job logs. Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance @@ -47,11 +49,24 @@ class Logs(_Proxy): 'time': 1486375511188, }] """ + def log(self, message, level=logging.INFO, ts=None, **other): + self._origin.log(message, level=level, ts=ts, **other) + + def debug(self, message, **other): + self._origin.debug(message, **other) + + def info(self, message, **other): + self._origin.info(message, **other) + + def warn(self, message, **other): + self._origin.warn(message, **other) + warning = warn + + def error(self, message, **other): + self._origin.error(message, **other) - def __init__(self, *args, **kwargs): - super(Logs, self).__init__(*args, **kwargs) - self._proxy_methods(['log', 'debug', 'info', 'warning', 'warn', - 'error', 'batch_write_start']) + def batch_write_start(self): + return self._origin.batch_write_start() def _modify_iter_params(self, params): """Modify iter() filters on-the-fly. 
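A short sketch of the explicit log helpers defined above (assuming a job that
is open for writing)::

    >>> import logging
    >>> job.logs.info('crawl started')
    >>> job.logs.error('something went wrong')
    >>> job.logs.log('custom entry', level=logging.WARNING)
    >>> job.logs.flush()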
diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index e0e6a8ce..2d2e26da 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -3,10 +3,6 @@ import six import json -from ..hubstorage.resourcetype import DownloadableResource -from ..hubstorage.resourcetype import ItemsResourceType -from ..hubstorage.collectionsrt import Collections - from .exceptions import wrap_value_too_large @@ -30,35 +26,18 @@ def __init__(self, cls, client, key): self._client = client self._origin = cls(client._hsclient, key) - if issubclass(cls, ItemsResourceType): - self._proxy_methods(['get', 'write', 'flush', 'close', - 'stats', ('iter', 'list')]) - # redefine write method to wrap hubstorage.ValueTooLarge error - origin_method = getattr(self, 'write') - setattr(self, 'write', wrap_value_too_large(origin_method)) - - # DType iter_values() has more priority than IType list() - # plus Collections interface doesn't need the iter methods - if issubclass(cls, DownloadableResource) and cls is not Collections: - methods = [('iter', 'iter_values'), - ('iter_raw_msgpack', 'iter_msgpack'), - ('iter_raw_json', 'iter_json')] - self._proxy_methods(methods) - self._wrap_iter_methods([method[0] for method in methods]) - - def _proxy_methods(self, methods): - """A little helper for cleaner interface.""" - proxy_methods(self._origin, self, methods) - - def _wrap_iter_methods(self, methods): - """Modify kwargs for all passed self.iter* methods.""" - for method in methods: - wrapped = wrap_kwargs(getattr(self, method), - self._modify_iter_params) - setattr(self, method, wrapped) + def list(self, *args, **kwargs): + """Convenient shortcut to list iter results. + + Please note that :meth:`list` method can use a lot of memory and for a + large amount of elements it's recommended to iterate through it via + :meth:`iter` method (all params and available filters are same for both + methods). + """ + return list(self.iter(*args, **kwargs)) def _modify_iter_params(self, params): - """A helper to modify iter() params on-the-fly. + """A helper to modify iter*() params on-the-fly. The method is internal and should be redefined in subclasses. @@ -68,15 +47,43 @@ def _modify_iter_params(self, params): """ return format_iter_filters(params) - def list(self, *args, **kwargs): - """Convenient shortcut to list iter results. - Please note that :meth:`list` method can use a lot of memory and for a - large amount of elements it's recommended to iterate through it via - :meth:`iter` method (all params and available filters are same for both - methods). 
- """ - return list(self.iter(*args, **kwargs)) +class _ItemsResourceProxy(_Proxy): + + def get(self, _key, **params): + return self._origin.get(_key, **params) + + @wrap_value_too_large + def write(self, item): + return self._origin.write(item) + + def iter(self, _key=None, **params): + params = self._modify_iter_params(params) + return self._origin.list(_key, **params) + + def flush(self): + self._origin.flush() + + def stats(self): + return self._origin.stats() + + def close(self, block=True): + self._origin.close(block) + + +class _DownloadableProxyMixin(object): + + def iter(self, _path=None, requests_params=None, **apiparams): + apiparams = self._modify_iter_params(apiparams) + return self._origin.iter_values(_path, requests_params, **apiparams) + + def iter_raw_json(self, _path=None, requests_params=None, **apiparams): + apiparams = self._modify_iter_params(apiparams) + return self._origin.iter_json(_path, requests_params, **apiparams) + + def iter_raw_msgpack(self, _path=None, requests_params=None, **apiparams): + apiparams = self._modify_iter_params(apiparams) + return self._origin.iter_msgpack(_path, requests_params, **apiparams) class _MappingProxy(_Proxy): @@ -130,25 +137,6 @@ def iter(self): return six.iteritems(next(self._origin.apiget())) -def proxy_methods(origin, successor, methods): - """A helper to proxy methods from origin to successor. - - Accepts a list with strings and tuples: - - - each string defines: - a successor method name to proxy 1:1 with origin method - - each tuple should consist of 2 strings: - a successor method name and an origin method name - """ - for method in methods: - if isinstance(method, tuple): - successor_name, origin_name = method - else: - successor_name, origin_name = method, method - if not hasattr(successor, successor_name): - setattr(successor, successor_name, getattr(origin, origin_name)) - - def format_iter_filters(params): """Format iter() filter param on-the-fly. @@ -168,11 +156,3 @@ def format_iter_filters(params): if filter_data: params['filter'] = filter_data return params - - -def wrap_kwargs(fn, kwargs_fn): - """Tiny wrapper to prepare modified version of function kwargs""" - def wrapped(*args, **kwargs): - kwargs = kwargs_fn(kwargs) - return fn(*args, **kwargs) - return wrapped diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index 2f9a217c..8a6808f1 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -1,9 +1,9 @@ from __future__ import absolute_import -from .proxy import _Proxy +from .proxy import _ItemsResourceProxy, _DownloadableProxyMixin -class Requests(_Proxy): +class Requests(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job requests. 
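To recap the subclass system: an items-based resource combines both bases, as
the ``Items``/``Requests`` class lines in this patch show. The annotated
sketch below (the usage lines are illustrative, not taken from the patch)
summarizes which base provides what::

    class Items(_ItemsResourceProxy, _DownloadableProxyMixin):
        # get/write/flush/close/stats come from _ItemsResourceProxy,
        # iter_raw_json/iter_raw_msgpack from _DownloadableProxyMixin
        pass

    job.items.write({'field': 'value'})       # re-raises oversized items as
                                              # the client-level ValueTooLarge
    raw_line = next(job.items.iter_raw_json())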
Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance @@ -41,6 +41,6 @@ class Requests(_Proxy): 'url': 'https://example.com' }] """ - def __init__(self, *args, **kwargs): - super(Requests, self).__init__(*args, **kwargs) - self._proxy_methods(['add']) + def add(self, url, status, method, rs, parent, duration, ts, fp=None): + return self._origin.add( + url, status, method, rs, parent, duration, ts, fp=None) diff --git a/scrapinghub/client/samples.py b/scrapinghub/client/samples.py index e966abfe..87a8e9bc 100644 --- a/scrapinghub/client/samples.py +++ b/scrapinghub/client/samples.py @@ -1,9 +1,9 @@ from __future__ import absolute_import -from .proxy import _Proxy +from .proxy import _ItemsResourceProxy -class Samples(_Proxy): +class Samples(_ItemsResourceProxy): """Representation of collection of job samples. Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instance From 54484ee7b24b75dac4212212090417f04b9ee6cd Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 17:44:53 +0300 Subject: [PATCH 28/40] Hide format_iter_filters method --- scrapinghub/client/activity.py | 4 ++-- scrapinghub/client/collections.py | 9 +++++---- scrapinghub/client/proxy.py | 4 ++-- tests/client/test_utils.py | 18 +++++++++--------- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 5cc6c3ee..8aff6f82 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -from .proxy import _Proxy, format_iter_filters +from .proxy import _Proxy from .utils import parse_job_key @@ -48,7 +48,7 @@ def __init__(self, *args, **kwargs): super(Activity, self).__init__(*args, **kwargs) def iter(self, **params): - params = format_iter_filters(params) + params = self._modify_iter_params(params) return self._origin.list(**params) def add(self, values, **kwargs): diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index 521cc008..a7fc9a0d 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -5,7 +5,7 @@ from ..hubstorage.collectionsrt import Collection as _Collection -from .proxy import _Proxy, _DownloadableProxyMixin, format_iter_filters +from .proxy import _Proxy, _DownloadableProxyMixin from .utils import update_kwargs @@ -143,6 +143,7 @@ class Collection(object): def __init__(self, client, collections, type_, name): self._client = client + self._collections = collections self._origin = _Collection(type_, name, collections._origin) def get(self, key, **params): @@ -188,7 +189,7 @@ def iter(self, key=None, prefix=None, prefixcount=None, startts=None, update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) - params = format_iter_filters(params) + params = self._collections._modify_iter_params(params) return self._origin._collections.iter_values( self._origin.coltype, self._origin.colname, **params) @@ -197,7 +198,7 @@ def iter_raw_json(self, key=None, prefix=None, prefixcount=None, update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) - params = format_iter_filters(params) + params = self._collections._modify_iter_params(params) return self._origin._collections.iter_json( self._origin.coltype, self._origin.colname, **params) @@ -220,7 +221,7 @@ def iter_raw_msgpack(self, key=None, prefix=None, 
prefixcount=None, update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) - params = format_iter_filters(params) + params = self._collections._modify_iter_params(params) return self._origin._collections.iter_msgpack( self._origin.coltype, self._origin.colname, **params) diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index 2d2e26da..51708c05 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -45,7 +45,7 @@ def _modify_iter_params(self, params): :return: an updated dictionary with parameters. :rtype: :class:`dict` """ - return format_iter_filters(params) + return _format_iter_filters(params) class _ItemsResourceProxy(_Proxy): @@ -137,7 +137,7 @@ def iter(self): return six.iteritems(next(self._origin.apiget())) -def format_iter_filters(params): +def _format_iter_filters(params): """Format iter() filter param on-the-fly. Support passing multiple filters at once as a list with tuples. diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 57787ccd..201ee78d 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -5,42 +5,42 @@ import mock from scrapinghub.client.utils import parse_auth -from scrapinghub.client.proxy import format_iter_filters +from scrapinghub.client.proxy import _format_iter_filters def test_format_iter_filters(): # work with empty params - assert format_iter_filters({}) == {} + assert _format_iter_filters({}) == {} # doesn't affect other params params = {'a': 123, 'b': 456} - assert format_iter_filters(params) == params + assert _format_iter_filters(params) == params # pass filter as-is if not list params = {'filter': 'some-string'} - assert format_iter_filters(params) == params + assert _format_iter_filters(params) == params # work fine with empty filter params = {'filter': []} - assert format_iter_filters(params) == params + assert _format_iter_filters(params) == params # pass string filters as-is params = {'filter': ['str1', 'str2']} - assert format_iter_filters(params) == params + assert _format_iter_filters(params) == params # converts list-formatted filters params = {'filter': [['field', '>=', ['val']], 'filter2']} - assert (format_iter_filters(params) == + assert (_format_iter_filters(params) == {'filter': ['["field", ">=", ["val"]]', 'filter2']}) # works the same with tuple entries params = {'filter': [('field', '==', ['val'])]} - assert (format_iter_filters(params) == + assert (_format_iter_filters(params) == {'filter': ['["field", "==", ["val"]]']}) # exception if entry is not list/tuple or string with pytest.raises(ValueError): - format_iter_filters({'filter': ['test', 123]}) + _format_iter_filters({'filter': ['test', 123]}) def test_parse_auth_none(): From dc76cb7cb3eb2c35afe5d4e41130aa3132e61488 Mon Sep 17 00:00:00 2001 From: Alexander Chekunkov Date: Tue, 28 Mar 2017 15:56:47 +0100 Subject: [PATCH 29/40] Make exception wrappers private Make wrap_http_errors private and remove wrap_value_too_large because it's not needed any more. 
--- scrapinghub/client/__init__.py | 7 +++---- scrapinghub/client/exceptions.py | 14 +------------- scrapinghub/client/proxy.py | 9 ++++++--- scrapinghub/client/spiders.py | 7 +++---- 4 files changed, 13 insertions(+), 24 deletions(-) diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py index a751075c..0cf5d080 100644 --- a/scrapinghub/client/__init__.py +++ b/scrapinghub/client/__init__.py @@ -1,9 +1,8 @@ from scrapinghub import Connection as _Connection from scrapinghub import HubstorageClient as _HubstorageClient +from .exceptions import _wrap_http_errors from .projects import Projects -from .exceptions import wrap_http_errors - from .utils import parse_auth from .utils import parse_project_id, parse_job_key @@ -13,14 +12,14 @@ class Connection(_Connection): - @wrap_http_errors + @_wrap_http_errors def _request(self, *args, **kwargs): return super(Connection, self)._request(*args, **kwargs) class HubstorageClient(_HubstorageClient): - @wrap_http_errors + @_wrap_http_errors def request(self, *args, **kwargs): return super(HubstorageClient, self).request(*args, **kwargs) diff --git a/scrapinghub/client/exceptions.py b/scrapinghub/client/exceptions.py index d79b2eac..6a4b405c 100644 --- a/scrapinghub/client/exceptions.py +++ b/scrapinghub/client/exceptions.py @@ -5,7 +5,6 @@ from requests import HTTPError from ..legacy import APIError -from ..hubstorage import ValueTooLarge as _ValueTooLarge def _get_http_error_msg(exc): @@ -57,7 +56,7 @@ class ServerError(ScrapinghubAPIError): """Indicates some server error: something unexpected has happened.""" -def wrap_http_errors(method): +def _wrap_http_errors(method): """Internal helper to handle exceptions gracefully.""" @wraps(method) def wrapped(*args, **kwargs): @@ -92,14 +91,3 @@ def wrapped(*args, **kwargs): raise ServerError(http_error=exc) raise ScrapinghubAPIError(msg) return wrapped - - -def wrap_value_too_large(method): - """Internal wrapper for ValueTooLarge exception.""" - @wraps(method) - def wrapped(*args, **kwargs): - try: - return method(*args, **kwargs) - except _ValueTooLarge as exc: - raise ValueTooLarge(str(exc)) - return wrapped diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index 51708c05..c83a0d85 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -3,7 +3,8 @@ import six import json -from .exceptions import wrap_value_too_large +from ..hubstorage import ValueTooLarge as _ValueTooLarge +from .exceptions import ValueTooLarge class _Proxy(object): @@ -53,9 +54,11 @@ class _ItemsResourceProxy(_Proxy): def get(self, _key, **params): return self._origin.get(_key, **params) - @wrap_value_too_large def write(self, item): - return self._origin.write(item) + try: + return self._origin.write(item) + except _ValueTooLarge as exc: + raise ValueTooLarge(str(exc)) def iter(self, _key=None, **params): params = self._modify_iter_params(params) diff --git a/scrapinghub/client/spiders.py b/scrapinghub/client/spiders.py index 1d8e4e81..1d665801 100644 --- a/scrapinghub/client/spiders.py +++ b/scrapinghub/client/spiders.py @@ -2,9 +2,8 @@ from requests.compat import urljoin +from .exceptions import NotFound, _wrap_http_errors from .jobs import Jobs -from .exceptions import NotFound -from .exceptions import wrap_http_errors from .utils import get_tags_for_update @@ -104,7 +103,7 @@ def __init__(self, client, project_id, spider_id, spider): self.jobs = Jobs(client, project_id, self) self._client = client - @wrap_http_errors + @_wrap_http_errors def update_tags(self, 
add=None, remove=None): """Update tags for the spider. @@ -118,7 +117,7 @@ def update_tags(self, add=None, remove=None): response = self._client._connection._session.patch(url, json=params) response.raise_for_status() - @wrap_http_errors + @_wrap_http_errors def list_tags(self): """List spider tags. From ac2a05234f8ed42a59f5253ad00d1ad907073b10 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 18:12:59 +0300 Subject: [PATCH 30/40] Add docstrings for proxy module --- scrapinghub/client/proxy.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index c83a0d85..638f3bcb 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -55,36 +55,70 @@ def get(self, _key, **params): return self._origin.get(_key, **params) def write(self, item): + """Write new element to collection.""" try: return self._origin.write(item) except _ValueTooLarge as exc: raise ValueTooLarge(str(exc)) def iter(self, _key=None, **params): + """Iterate over elements in collection. + + :return: a generator object over a list of element dictionaries. + :rtype: :class:`types.GeneratorType[dict]` + """ + # TODO describe allowable params params = self._modify_iter_params(params) return self._origin.list(_key, **params) def flush(self): + """Flush data from writer threads.""" self._origin.flush() def stats(self): + """Get resource stats. + + :return: a dictionary with stats data. + :rtype: :class:`dict` + """ return self._origin.stats() def close(self, block=True): + """Close writers one-by-one.""" self._origin.close(block) class _DownloadableProxyMixin(object): def iter(self, _path=None, requests_params=None, **apiparams): + """A general method to iterate through elements. + + :return: an iterator over elements list. + :rtype: :class:`collections.Iterable` + """ + # TODO describe allowable params apiparams = self._modify_iter_params(apiparams) return self._origin.iter_values(_path, requests_params, **apiparams) def iter_raw_json(self, _path=None, requests_params=None, **apiparams): + """A method to iterate through raw json-packed elements. + Can be convenient if data is needed in raw json format. + + :return: an iterator over elements list packed with json. + :rtype: :class:`collections.Iterable[str]` + """ + # TODO describe allowable params apiparams = self._modify_iter_params(apiparams) return self._origin.iter_json(_path, requests_params, **apiparams) def iter_raw_msgpack(self, _path=None, requests_params=None, **apiparams): + """A method to iterate through raw msgpack-ed elements. + Can be convenient if data is needed in same msgpack format. + + :return: an iterator over elements list packed with msgpack. 
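An illustrative use of the raw iteration helpers documented above; each
element yielded by ``iter_raw_json()`` is assumed to be a single JSON-encoded
record, as the return annotation suggests::

    >>> import json
    >>> raw = next(job.items.iter_raw_json())
    >>> json.loads(raw)
    {'field': 'value'}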
+ :rtype: :class:`collections.Iterable[bytes]` + """ + # TODO describe allowable params apiparams = self._modify_iter_params(apiparams) return self._origin.iter_msgpack(_path, requests_params, **apiparams) From 93582c3a8bdb082a2b98cab3d2d81365239b164b Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 18:38:01 +0300 Subject: [PATCH 31/40] Add missing docstrings --- scrapinghub/client/activity.py | 9 ++++--- scrapinghub/client/collections.py | 45 ++++++++++++++++++++++++++++++- scrapinghub/client/frontiers.py | 2 ++ scrapinghub/client/logs.py | 12 +++++++++ scrapinghub/client/requests.py | 11 ++++++++ 5 files changed, 75 insertions(+), 4 deletions(-) diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 8aff6f82..41ef217b 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -44,10 +44,13 @@ class Activity(_Proxy): >>> project.activity.add(events) """ - def __init__(self, *args, **kwargs): - super(Activity, self).__init__(*args, **kwargs) - def iter(self, **params): + """Iterate over activity events. + + :return: a generator object over a list of activity event dicts. + :rtype: :class:`types.GeneratorType[dict]` + """ + # TODO describe allowable params params = self._modify_iter_params(params) return self._origin.list(**params) diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index a7fc9a0d..751f0213 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -181,11 +181,29 @@ def delete(self, keys): self._origin.delete(keys) def count(self, *args, **kwargs): + """Count collection items with a given filters. + + :return: amount of elements in collection. + :rtype: :class:`int` + """ + # TODO describe allowable params return self._origin._collections.count( self._origin.coltype, self._origin.colname, *args, **kwargs) def iter(self, key=None, prefix=None, prefixcount=None, startts=None, endts=None, requests_params=None, **params): + """A method to iterate through collection items. + + :param key: a string key or a list of keys to filter with. + :param prefix: a string prefix to filter items. + :param prefixcount: maximum number of values to return per prefix. + :param startts: UNIX timestamp at which to begin results. + :param endts: UNIX timestamp at which to end results. + :param requests_params: (optional) a dict with optional requests params. + :param \*\*params: (optional) additional query params for the request. + :return: an iterator over items list. + :rtype: :class:`collections.Iterable[dict]` + """ update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) @@ -195,6 +213,19 @@ def iter(self, key=None, prefix=None, prefixcount=None, startts=None, def iter_raw_json(self, key=None, prefix=None, prefixcount=None, startts=None, endts=None, requests_params=None, **params): + """A method to iterate through json pack-ed items. + Can be convenient if data is needed in the json format. + + :param key: a string key or a list of keys to filter with. + :param prefix: a string prefix to filter items. + :param prefixcount: maximum number of values to return per prefix. + :param startts: UNIX timestamp at which to begin results. + :param endts: UNIX timestamp at which to end results. + :param requests_params: (optional) a dict with optional requests params. + :param \*\*params: (optional) additional query params for the request. 
+ :return: an iterator over items list packed with json. + :rtype: :class:`collections.Iterable[str]` + """ update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, startts=startts, endts=endts, requests_params=requests_params) @@ -248,6 +279,18 @@ def list(self, key=None, prefix=None, prefixcount=None, startts=None, startts=startts, endts=endts) return list(self.iter(requests_params=requests_params, **params)) - def create_writer(self, **kwargs): + def create_writer(self, start=0, auth=None, size=1000, interval=15, + qsize=None, content_encoding='identity', + maxitemsize=1024 ** 2, callback=None): + """Create a new writer for a collection. + + :return: a new writer object. + :rtype: :class:`~scrapinghub.hubstorage.batchuploader._BatchWriter` + """ + # TODO describe allowable params + kwargs = {} + update_kwargs(start=start, auth=auth, size=size, interval=interval, + qsize=qsize, content_encoding=content_encoding, + maxitemsize=maxitemsize, callback=callback) return self._origin._collections.create_writer( self._origin.coltype, self._origin.colname, **kwargs) diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index e9d4db96..a6c2f30e 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -121,9 +121,11 @@ def newcount(self): return sum(self._origin.newcount.values()) def flush(self): + """Flush data in all frontiers writer threads.""" self._origin.flush() def close(self): + """Close frontier writer threads one-by-one.""" self._origin.close() diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 8b53356a..130e5720 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -50,22 +50,34 @@ class Logs(_ItemsResourceProxy, _DownloadableProxyMixin): }] """ def log(self, message, level=logging.INFO, ts=None, **other): + """Base method to write a log entry. + + :param message: a string message. + :param level: (optional) logging level, default to INFO. + :param ts: (optional) unix timestamp in milliseconds. + :param \*\*other: other optional kwargs. + """ self._origin.log(message, level=level, ts=ts, **other) def debug(self, message, **other): + """Log a message with DEBUG level.""" self._origin.debug(message, **other) def info(self, message, **other): + """Log a message with INFO level.""" self._origin.info(message, **other) def warn(self, message, **other): + """Log a message with WARN level.""" self._origin.warn(message, **other) warning = warn def error(self, message, **other): + """Log a message with ERROR level.""" self._origin.error(message, **other) def batch_write_start(self): + """Override to set a start parameter when commencing writing.""" return self._origin.batch_write_start() def _modify_iter_params(self, params): diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index 8a6808f1..13acf6e6 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -42,5 +42,16 @@ class Requests(_ItemsResourceProxy, _DownloadableProxyMixin): }] """ def add(self, url, status, method, rs, parent, duration, ts, fp=None): + """ Add a new requests. + + :param url: string url for the request. + :param status: HTTP status of the request. + :param method: stringified request method. + :param rs: response body length. + :param parent: parent request id or ``None``. + :param duration: request duration in milliseconds. + :param ts: unix timestamp in milliseconds. + :param fp: (optional) string fingerprint for the request. 
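Putting the documented parameters together, an illustrative ``add()`` call
(all values are made up)::

    >>> job.requests.add(
    ...     url='http://example.com/', status=200, method='GET',
    ...     rs=1024, parent=None, duration=120, ts=1486375511188)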
+ """ return self._origin.add( url, status, method, rs, parent, duration, ts, fp=None) From 22f646f19e59aff98a6b668361ecf7bb7b4359fb Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 19:35:56 +0300 Subject: [PATCH 32/40] Add count param for iter* methods --- scrapinghub/client/activity.py | 7 ++++--- scrapinghub/client/collections.py | 14 ++++++++++++-- scrapinghub/client/proxy.py | 23 +++++++++++++++-------- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 41ef217b..665ee3b6 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from .proxy import _Proxy -from .utils import parse_job_key +from .utils import parse_job_key, update_kwargs class Activity(_Proxy): @@ -44,13 +44,14 @@ class Activity(_Proxy): >>> project.activity.add(events) """ - def iter(self, **params): + def iter(self, count=None, **params): """Iterate over activity events. + :param count: limit amount of elements. :return: a generator object over a list of activity event dicts. :rtype: :class:`types.GeneratorType[dict]` """ - # TODO describe allowable params + update_kwargs(params, count=count) params = self._modify_iter_params(params) return self._origin.list(**params) diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index 751f0213..ee7c4bd1 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -284,12 +284,22 @@ def create_writer(self, start=0, auth=None, size=1000, interval=15, maxitemsize=1024 ** 2, callback=None): """Create a new writer for a collection. + :param start: (optional) initial offset for writer thread. + :param auth: (optional) set auth credentials for the request. + :param size: (optional) set initial queue size. + :param interval: (optional) set interval for writer thread. + :param qsize: (optional) setup max queue size for the writer. + :param content_encoding: (optional) set different Content-Encoding header. + :param maxitemsize: (optional) max item size in bytes. + :param callback: (optional) some callback function. :return: a new writer object. :rtype: :class:`~scrapinghub.hubstorage.batchuploader._BatchWriter` + + If provided - calllback shouldn't try to inject more items in the queue, + otherwise it can lead to deadlocks. """ - # TODO describe allowable params kwargs = {} - update_kwargs(start=start, auth=auth, size=size, interval=interval, + update_kwargs(kwargs, start=start, auth=auth, size=size, interval=interval, qsize=qsize, content_encoding=content_encoding, maxitemsize=maxitemsize, callback=callback) return self._origin._collections.create_writer( diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index 638f3bcb..b15af190 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -4,6 +4,7 @@ import json from ..hubstorage import ValueTooLarge as _ValueTooLarge +from .utils import update_kwargs from .exceptions import ValueTooLarge @@ -61,13 +62,14 @@ def write(self, item): except _ValueTooLarge as exc: raise ValueTooLarge(str(exc)) - def iter(self, _key=None, **params): + def iter(self, _key=None, count=None, **params): """Iterate over elements in collection. + :param count: limit amount of elements. :return: a generator object over a list of element dictionaries. 
:rtype: :class:`types.GeneratorType[dict]` """ - # TODO describe allowable params + update_kwargs(params or {}, count=count) params = self._modify_iter_params(params) return self._origin.list(_key, **params) @@ -90,35 +92,40 @@ def close(self, block=True): class _DownloadableProxyMixin(object): - def iter(self, _path=None, requests_params=None, **apiparams): + def iter(self, _path=None, count=None, requests_params=None, **apiparams): """A general method to iterate through elements. + :param count: limit amount of elements. :return: an iterator over elements list. :rtype: :class:`collections.Iterable` """ - # TODO describe allowable params + update_kwargs(apiparams, count=count) apiparams = self._modify_iter_params(apiparams) return self._origin.iter_values(_path, requests_params, **apiparams) - def iter_raw_json(self, _path=None, requests_params=None, **apiparams): + def iter_raw_json(self, _path=None, count=None, requests_params=None, + **apiparams): """A method to iterate through raw json-packed elements. Can be convenient if data is needed in raw json format. + :param count: limit amount of elements. :return: an iterator over elements list packed with json. :rtype: :class:`collections.Iterable[str]` """ - # TODO describe allowable params + update_kwargs(apiparams, count=count) apiparams = self._modify_iter_params(apiparams) return self._origin.iter_json(_path, requests_params, **apiparams) - def iter_raw_msgpack(self, _path=None, requests_params=None, **apiparams): + def iter_raw_msgpack(self, _path=None, count=None, requests_params=None, + **apiparams): """A method to iterate through raw msgpack-ed elements. Can be convenient if data is needed in same msgpack format. + :param count: limit amount of elements. :return: an iterator over elements list packed with msgpack. 
:rtype: :class:`collections.Iterable[bytes]` """ - # TODO describe allowable params + update_kwargs(apiparams, count=count) apiparams = self._modify_iter_params(apiparams) return self._origin.iter_msgpack(_path, requests_params, **apiparams) From 5cd822189cd577526d3a3ff0d88d3fb83f7a57a6 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Tue, 28 Mar 2017 19:37:32 +0300 Subject: [PATCH 33/40] Mv proxy tests to a separate module --- tests/client/test_proxy.py | 38 ++++++++++++++++++++++++++++++++++++++ tests/client/test_utils.py | 36 ------------------------------------ 2 files changed, 38 insertions(+), 36 deletions(-) create mode 100644 tests/client/test_proxy.py diff --git a/tests/client/test_proxy.py b/tests/client/test_proxy.py new file mode 100644 index 00000000..7fd4f272 --- /dev/null +++ b/tests/client/test_proxy.py @@ -0,0 +1,38 @@ +import pytest + +from scrapinghub.client.proxy import _format_iter_filters + + +def test_format_iter_filters(): + # work with empty params + assert _format_iter_filters({}) == {} + + # doesn't affect other params + params = {'a': 123, 'b': 456} + assert _format_iter_filters(params) == params + + # pass filter as-is if not list + params = {'filter': 'some-string'} + assert _format_iter_filters(params) == params + + # work fine with empty filter + params = {'filter': []} + assert _format_iter_filters(params) == params + + # pass string filters as-is + params = {'filter': ['str1', 'str2']} + assert _format_iter_filters(params) == params + + # converts list-formatted filters + params = {'filter': [['field', '>=', ['val']], 'filter2']} + assert (_format_iter_filters(params) == + {'filter': ['["field", ">=", ["val"]]', 'filter2']}) + + # works the same with tuple entries + params = {'filter': [('field', '==', ['val'])]} + assert (_format_iter_filters(params) == + {'filter': ['["field", "==", ["val"]]']}) + + # exception if entry is not list/tuple or string + with pytest.raises(ValueError): + _format_iter_filters({'filter': ['test', 123]}) diff --git a/tests/client/test_utils.py b/tests/client/test_utils.py index 201ee78d..f109894c 100644 --- a/tests/client/test_utils.py +++ b/tests/client/test_utils.py @@ -5,42 +5,6 @@ import mock from scrapinghub.client.utils import parse_auth -from scrapinghub.client.proxy import _format_iter_filters - - -def test_format_iter_filters(): - # work with empty params - assert _format_iter_filters({}) == {} - - # doesn't affect other params - params = {'a': 123, 'b': 456} - assert _format_iter_filters(params) == params - - # pass filter as-is if not list - params = {'filter': 'some-string'} - assert _format_iter_filters(params) == params - - # work fine with empty filter - params = {'filter': []} - assert _format_iter_filters(params) == params - - # pass string filters as-is - params = {'filter': ['str1', 'str2']} - assert _format_iter_filters(params) == params - - # converts list-formatted filters - params = {'filter': [['field', '>=', ['val']], 'filter2']} - assert (_format_iter_filters(params) == - {'filter': ['["field", ">=", ["val"]]', 'filter2']}) - - # works the same with tuple entries - params = {'filter': [('field', '==', ['val'])]} - assert (_format_iter_filters(params) == - {'filter': ['["field", "==", ["val"]]']}) - - # exception if entry is not list/tuple or string - with pytest.raises(ValueError): - _format_iter_filters({'filter': ['test', 123]}) def test_parse_auth_none(): From 290664c0a6ed3f3f16cc044ed4488ff8365813e0 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 29 Mar 2017 10:21:44 +0300 Subject: 
[PATCH 34/40] Add missing docstring for _ItemsResourceProxy.get --- scrapinghub/client/proxy.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index b15af190..ffe99837 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -52,8 +52,14 @@ def _modify_iter_params(self, params): class _ItemsResourceProxy(_Proxy): - def get(self, _key, **params): - return self._origin.get(_key, **params) + def get(self, key, **params): + """Get element from collection. + + :param key: element key. + :return: a dictionary with element data. + :rtype: :class:`dict` + """ + return self._origin.get(key, **params) def write(self, item): """Write new element to collection.""" From 401064a63914520b0ea2322409e24b1b91c80d37 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 29 Mar 2017 10:31:22 +0300 Subject: [PATCH 35/40] Minor fixes, move project settings section down in overview --- docs/client/overview.rst | 52 ++++++++++++++++++------------------- docs/quickstart.rst | 2 +- scrapinghub/client/proxy.py | 5 +++- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/docs/client/overview.rst b/docs/client/overview.rst index 2b32314f..bc195c3c 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -20,7 +20,8 @@ for access to client projects. Projects -------- -You can list the projects available to your account:: +You can list the :class:`~scrapinghub.client.projects.Projects` available to your +account:: >>> client.projects.list() [123, 456] @@ -67,31 +68,6 @@ For example, to schedule a spider run (it returns a > -Settings --------- - -You can work with project settings via :class:`~scrapinghub.client.projects.Settings`. - -To get a list of the project settings:: - - >>> project.settings.list() - [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] - -To get a project setting value by name:: - - >>> project.settings.get('job_runtime_limit') - 24 - -To update a project setting value by name:: - - >>> project.settings.set('job_runtime_limit', 20) - -Or update a few project settings at once:: - - >>> project.settings.update({'default_job_units': 1, - ... 'job_runtime_limit': 20}) - - Spiders ------- @@ -428,6 +404,30 @@ Or post multiple events at once:: >>> project.activity.add(events) +Settings +-------- + +You can work with project settings via :class:`~scrapinghub.client.projects.Settings`. + +To get a list of the project settings:: + + >>> project.settings.list() + [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] + +To get a project setting value by name:: + + >>> project.settings.get('job_runtime_limit') + 24 + +To update a project setting value by name:: + + >>> project.settings.set('job_runtime_limit', 20) + +Or update a few project settings at once:: + + >>> project.settings.update({'default_job_units': 1, + ... 'job_runtime_limit': 20}) + Collections ----------- diff --git a/docs/quickstart.rst b/docs/quickstart.rst index f484bd35..3fced7e0 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -69,7 +69,7 @@ By default, tests use VCR.py ``once`` mode to: It means that if you add new integration tests and run all tests as usual, only new cassettes will be created, all existing cassettes will stay unmodified. 
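+
+For reference, the ``once`` record mode behaves roughly like the following
+plain VCR.py snippet (an illustrative sketch, not this project's actual test
+setup; the cassette path and URL are made up)::
+
+    import requests
+    import vcr
+
+    # The first run performs a real HTTP request and records it into the
+    # cassette file; every later run replays the recorded response instead.
+    with vcr.use_cassette('fixtures/example.yaml', record_mode='once'):
+        response = requests.get('https://httpbin.org/get')
+        assert response.status_code == 200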
-To ignore existing cassettes and use real service, please provide a flag:: +To ignore existing cassettes and use real services, please provide a flag:: py.test --ignore-cassettes diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index ffe99837..7d399643 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -62,7 +62,10 @@ def get(self, key, **params): return self._origin.get(key, **params) def write(self, item): - """Write new element to collection.""" + """Write new element to collection. + + :param item: element data dict to write. + """ try: return self._origin.write(item) except _ValueTooLarge as exc: From c1399be162f7a22ff026b104f4a937b599bb59a9 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 29 Mar 2017 10:46:17 +0300 Subject: [PATCH 36/40] Refactor and minor fixes for overview --- docs/client/overview.rst | 93 +++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 44 deletions(-) diff --git a/docs/client/overview.rst b/docs/client/overview.rst index bc195c3c..6a9c2c28 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -136,12 +136,12 @@ Use ``run`` method to run a new job for project/spider:: Scheduling logic supports different options, like -- job_args to provide arguments for the job -- units to specify amount of units to run the job -- job_settings to pass additional settings for the job -- priority to set higher/lower priority of the job -- add_tag to create a job with a set of initial tags -- meta to pass additional custom metadata +- **job_args** to provide arguments for the job +- **units** to specify amount of units to run the job +- **job_settings** to pass additional settings for the job +- **priority** to set higher/lower priority of the job +- **add_tag** to create a job with a set of initial tags +- **meta** to pass additional custom metadata For example, to run a new job for a given spider with custom params:: @@ -211,8 +211,9 @@ To get jobs filtered by tags:: >>> jobs_summary = project.jobs.iter(has_tag=['new', 'verified'], lacks_tag='obsolete') -List of tags has ``OR`` power, so in the case above jobs with 'new' or -'verified' tag are expected. +List of tags in **has_tag** has ``OR`` power, so in the case above jobs with +``new`` or ``verified`` tag are expected (while list of tags in **lacks_tag** +has ``AND`` power). To get certain number of last finished jobs per some spider:: @@ -227,7 +228,7 @@ for filtering by state: - deleted Dict entries returned by ``iter`` method contain some additional meta, -but can be easily converted to ``Job`` instances with:: +but can be easily converted to :class:`~scrapinghub.client.jobs.Job` instances with:: >>> [Job(x['key']) for x in jobs] [ @@ -266,6 +267,25 @@ It's also possible to get last jobs summary (for each spider):: Note that there can be a lot of spiders, so the method above returns an iterator. + +update_tags +^^^^^^^^^^^ + +Tags is a convenient way to mark specific jobs (for better search, postprocessing etc). + + +To mark all spider jobs with tag ``consumed``:: + + >>> spider.jobs.update_tags(add=['consumed']) + +To remove existing tag ``existing`` for all spider jobs:: + + >>> spider.jobs.update_tags(remove=['existing']) + +Modifying tags is available on :class:`~scrapinghub.client.spiders.Spider`/ +:class:`~scrapinghub.client.jobs.Job` levels. + + Job --- @@ -286,6 +306,10 @@ To delete a job:: >>> job.delete() +To mark a job with tag ``consumed``:: + + >>> job.update_tags(add=['consumed']) + .. 
_job-metadata: Metadata @@ -404,31 +428,6 @@ Or post multiple events at once:: >>> project.activity.add(events) -Settings --------- - -You can work with project settings via :class:`~scrapinghub.client.projects.Settings`. - -To get a list of the project settings:: - - >>> project.settings.list() - [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] - -To get a project setting value by name:: - - >>> project.settings.get('job_runtime_limit') - 24 - -To update a project setting value by name:: - - >>> project.settings.set('job_runtime_limit', 20) - -Or update a few project settings at once:: - - >>> project.settings.update({'default_job_units': 1, - ... 'job_runtime_limit': 20}) - - Collections ----------- @@ -559,24 +558,30 @@ Frontiers are available on project level only. .. _job-tags: -Tags ----- -Tags is a convenient way to mark specific jobs (for better search, postprocessing etc). +Settings +-------- -To mark a job with tag ``consumed``:: +You can work with project settings via :class:`~scrapinghub.client.projects.Settings`. - >>> job.update_tags(add=['consumed']) +To get a list of the project settings:: -To mark all spider jobs with tag ``consumed``:: + >>> project.settings.list() + [(u'default_job_units', 2), (u'job_runtime_limit', 24)]] - >>> spider.jobs.update_tags(add=['consumed']) +To get a project setting value by name:: -To remove existing tag ``existing`` for all spider jobs:: + >>> project.settings.get('job_runtime_limit') + 24 - >>> spider.jobs.update_tags(remove=['existing']) +To update a project setting value by name:: + + >>> project.settings.set('job_runtime_limit', 20) -Modifying tags is available on spider/job levels. +Or update a few project settings at once:: + + >>> project.settings.update({'default_job_units': 1, + ... 'job_runtime_limit': 20}) Exceptions From b06ae7059d0b9a4628e044867e11de87c7186a7e Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 29 Mar 2017 11:05:19 +0300 Subject: [PATCH 37/40] Drop iter_raw_* methods --- scrapinghub/client/collections.py | 45 ------------------- scrapinghub/client/proxy.py | 26 ----------- .../test_logs/test_logs_iter_raw_json.gz | 1 - .../test_logs/test_logs_iter_raw_msgpack.gz | 1 - .../test_requests_iter_raw_json.gz | 1 - tests/client/test_items.py | 16 ------- tests/client/test_logs.py | 34 -------------- tests/client/test_requests.py | 17 ------- 8 files changed, 141 deletions(-) delete mode 100644 tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz delete mode 100644 tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz delete mode 100644 tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index ee7c4bd1..960f0bfd 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -211,51 +211,6 @@ def iter(self, key=None, prefix=None, prefixcount=None, startts=None, return self._origin._collections.iter_values( self._origin.coltype, self._origin.colname, **params) - def iter_raw_json(self, key=None, prefix=None, prefixcount=None, - startts=None, endts=None, requests_params=None, **params): - """A method to iterate through json pack-ed items. - Can be convenient if data is needed in the json format. - - :param key: a string key or a list of keys to filter with. - :param prefix: a string prefix to filter items. - :param prefixcount: maximum number of values to return per prefix. - :param startts: UNIX timestamp at which to begin results. 
- :param endts: UNIX timestamp at which to end results. - :param requests_params: (optional) a dict with optional requests params. - :param \*\*params: (optional) additional query params for the request. - :return: an iterator over items list packed with json. - :rtype: :class:`collections.Iterable[str]` - """ - update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, - startts=startts, endts=endts, - requests_params=requests_params) - params = self._collections._modify_iter_params(params) - return self._origin._collections.iter_json( - self._origin.coltype, self._origin.colname, **params) - - def iter_raw_msgpack(self, key=None, prefix=None, prefixcount=None, - startts=None, endts=None, requests_params=None, - **params): - """A method to iterate through raw msgpack-ed items. - Can be convenient if data is needed in same msgpack format. - - :param key: a string key or a list of keys to filter with. - :param prefix: a string prefix to filter items. - :param prefixcount: maximum number of values to return per prefix. - :param startts: UNIX timestamp at which to begin results. - :param endts: UNIX timestamp at which to end results. - :param requests_params: (optional) a dict with optional requests params. - :param \*\*params: (optional) additional query params for the request. - :return: an iterator over items list packed with msgpack. - :rtype: :class:`collections.Iterable[bytes]` - """ - update_kwargs(params, key=key, prefix=prefix, prefixcount=prefixcount, - startts=startts, endts=endts, - requests_params=requests_params) - params = self._collections._modify_iter_params(params) - return self._origin._collections.iter_msgpack( - self._origin.coltype, self._origin.colname, **params) - def list(self, key=None, prefix=None, prefixcount=None, startts=None, endts=None, requests_params=None, **params): """Convenient shortcut to list iter results. diff --git a/scrapinghub/client/proxy.py b/scrapinghub/client/proxy.py index 7d399643..6f247b0d 100644 --- a/scrapinghub/client/proxy.py +++ b/scrapinghub/client/proxy.py @@ -112,32 +112,6 @@ def iter(self, _path=None, count=None, requests_params=None, **apiparams): apiparams = self._modify_iter_params(apiparams) return self._origin.iter_values(_path, requests_params, **apiparams) - def iter_raw_json(self, _path=None, count=None, requests_params=None, - **apiparams): - """A method to iterate through raw json-packed elements. - Can be convenient if data is needed in raw json format. - - :param count: limit amount of elements. - :return: an iterator over elements list packed with json. - :rtype: :class:`collections.Iterable[str]` - """ - update_kwargs(apiparams, count=count) - apiparams = self._modify_iter_params(apiparams) - return self._origin.iter_json(_path, requests_params, **apiparams) - - def iter_raw_msgpack(self, _path=None, count=None, requests_params=None, - **apiparams): - """A method to iterate through raw msgpack-ed elements. - Can be convenient if data is needed in same msgpack format. - - :param count: limit amount of elements. - :return: an iterator over elements list packed with msgpack. 
- :rtype: :class:`collections.Iterable[bytes]` - """ - update_kwargs(apiparams, count=count) - apiparams = self._modify_iter_params(apiparams) - return self._origin.iter_msgpack(_path, requests_params, **apiparams) - class _MappingProxy(_Proxy): """A helper class to support basic get/set interface for dict-like diff --git a/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz b/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz deleted file mode 100644 index bce71e44..00000000 --- a/tests/client/cassetes/test_logs/test_logs_iter_raw_json.gz +++ /dev/null @@ -1 +0,0 @@ -eJyll/1XE2cWx0HrG+tbt12VVbcBDY1K3g1ENCoQ3kQEJOhoO7pD8pAZCJncmQkvulGKFS2K5VWsrgiV2tquortlPSurnnOfX/av2f9gz9lnngSxLnvadZNMcu7NnTz3uffzvTPpXZaCLJuwNisrS4kbRJPChqLGdcgWYZktBcttwmr2lUb0BPMSeC8FK2zCKuaSiRQhmg4rU7DKJrzHPEHJILBahDXCdmZVakqhxbPXUidpFo/LXWxxe0u8/hKXy1JVF4IcSVjJgpqI1kk0+JUIa4VtzD7ZVFWT9jldDo+lvKHHkNW40+vY6yiCdRJf5oSk9cB6ETYI65lVGg6ThFFoKVfVdoXARklYYXpjMbUL3hfh18I6ZjbUN4UKLfUNoZr6Y03wgcS3W66y/cYNe6gnQeBDEX4jbGReKZGIKWHJLIKzTVfjsElK8mVb1EgPbE7BFp64bmhKPAq5wsfMOJ+vG5KR1PNL8tX2/ML8NrVFiTDDk3443U5fkS8/Bb/VM+eawbA1BdvSpewgui5FCWwXljGrvhZ+x1cMqxECH9X+I5nkURqBJNENsKQgzyZvFqzMl9DUNhI2ApmVCvSEwpoSkHW7wULtaRPyZdakHTZeitIkq6imnOM7hJ0iWAUPc5dJuhK2nO74pc9TRVAgCTnszGadaPbSKKskfCyCjfc+wdtm18OalGBlkpMtTo/DxZ4R0umCXZKw4XXn7BVxtk+zlrtF2MNTjJ5TEoWWCGmNmUAVpknJ9NcugkOoNYuja62G2k7inkBLUArWdRc1x6uUruLiU+XdSijS1tJVU+U2fDHDr4aPlEKoo12PNJeHaoqrG4/G1NZOzZWoIKQoFPM3gHMJIFwiuIW8t4Dotnd1ddlbVa3DntRixMycRMCTLgQ7PU64esArwl7uayckYZdiSicBn8R5XVjkKIlHDRmKRCjmXfcWgz+903RdYJ8IJcJyZu927ob9DMKVnBRW2Agc4HyYVEOAxyQ1BQ4KO01ZGkaixOn0eh3pl89d4nf5XU7WCKeWjDs404eSyRQctslMzaU2E44y21LJlYsQFLKZ3wUVkrA1k5uu2804TY3Zuczs9ZoSVeJQKUIVj94N1UsUtEaEI0Iu8xqk23AmYpIS328Jy5KmEyPQHKq0+6FWSspMYUflXIHFZUGdLjORHLPJr4VRL3M9yEwDDUwD4bNmB8J6TroVOdAo/JMFfoTXVmfty6PjwjL6Cr+hw3Qwh/a51mbRKTqPky105gBOXzToOD7Dr7bU4jR9hDP4iP7FRf9KH3fiBD4kZXQ4fOEgjuBQLo7AavwWb+LADjqNIzUVESfObyo8Jus4icPb/Hgd6FD71iC9H9+0wdGWU+vFV914bxd+W1aDj/F7P52nM+wYpQ9P0hsX6fTmA/QencPv2xP4N3xOn9IJfEpv0j/TvoPYT6dwkE7QV/TJ6T05eC0Pr0v0koqX8XN3aQ+dpZP7cYI+WIMT2bS/oYDVBY5zNJhYlLgbmvohdByazaaesL0t0JMiCL9AoKckmQ2G0+86GD6RZKblT5fQsijJTBpnZJ8kM/DPcnTdvn3we4nP3nCGF7IwEiQRWjjq5s9AWJKZKCIyE4P8hgKIfFA4sEi+bqgaG6aOzg7HG9tzhNUOZ0yN6s6fDOVDbBZrRsAFraYiolwRMleEsqQi2kRoT2dd7IZYOuuFkNeDrEOE+GLW6hJaSIgAQsF/DBdTmzElTvS3haH9vPp0EQy5Oi2gpJwrNwr/WlBC+oHjdKjOlMHGFQyxF/QK3kpalQv4AJ8y4IdxILymCB/tbMRhOoLzn0CYflZgMOi/pFfolXCl/wx9uMKHV+ksix9R6NjRw9l0uAT78HlolaMRp+i0IGVvD+B3eKfpDPbi/Y14uxMfH8Yn9DGDe5hO4uUTtA9/3Mo0Oap90IMP6CjL4wX7zR/Y6QPY25JNb9OXHpwJ4F2mihnad74IX+KzVGt+vICOnmFbeErHaN8OOrsK762kgxsEOoi3CnG8VDlxYcuuIN6ik3Qg2oWPvKYwOmWmhq7j0M3nSM8bc+Tc4hw5z+bIMbPjf7CZcKZMOJlOLshpFVx8VxX0chV8Jouc2r4MtSY7VRUhuMSgPf3/QZvxWT2VbnYwP3v3FLArhBQ420564HMT6csc6X6O9JUlkb4qwhe8LPu8MPBfiL4mwvVFogeXIPqGCF/+L0QP/TzRwyKMLBA9yonu/ynRVjpQmoO9Fgb0EM7Tv+McnZZLGLIM5014KXgEp94PGvQZ9jEqntO71XjnU7yKTw514I9RnF3+Id7AuRz8Grw4GOvJX49/aqAzOMbCv6vG6SN0tpa+zGEXhjl8sQ7nogeceQE6htMN5qVpzERr/Djc5GhNvIHWrUW0vlpA6zZH604GrT9m0Lr7rmhNcrTuZdCaegutaYaW/q5ovcanoFWJsb8DAauvzOrxxEgniVlN2Mr3mB/eCqs3uGD7yva6rL4ge8HXySThN6vsDl4374Tu12YnHf8GcstbzQ== \ No newline at end of file diff --git a/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz b/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz deleted file mode 100644 index dc115941..00000000 --- a/tests/client/cassetes/test_logs/test_logs_iter_raw_msgpack.gz +++ /dev/null @@ -1 +0,0 @@ 
-eJyllv1TFGcSx5GoIfieixoNXlaSxT1l32UhGEyQd1F5W5LRc9Rh92FnYNnZnpflRTeGGGIiEhNEPSWK5IQ7zouVGMvEnFWp6qeu6v6i+/l6ngVBi1Qsb3Z2t7q3Z55+uj/f3vk4Pwt5HmltXl6elrKYocQsTU+ZsEKGfE8WXvJIBfSTwcw0eRmszMIqj/QyuVSmxJlhwuosvOyRVpKnVrEYFMjwirSDrHpDK3WF9roOK4YrFAiWu4LhynBFZbDM1XA4CoWKtJqCOpiRYQaskWGtVET2hx0NTTmfP+ALuWpaBy1VT/nDvr2+CKxTxDIfKMYgrJdhg7SerOpYjKWtUleNrvdqDDYq0irHm0zq/bBJhleldWS2tnRES10trdGmliMd8AdFbLdGp/2mLG90MM3gNRk2SxvJq6TTSS2mOEXw95h6CrYotli2S48PwtYsvC4SNy1DSyVgm7SLjNPFpqVYtllcWaz3FpcW9+hdWpyMUO7wB/1lkUhxFrab89c6wfBGFopypexjpqkkGOyQ8slqaYY/ihVjepzBm83/sW0RZTCwmWmBKws7PepWyU2+tKH3sJhVNb9SiZnWqClVqum1KNSbM6FYpSa95RGlqLapooY2JHYIb8vglkLkPqCYWsx1rO95X0cjUKJIhXRlp8kMb3WCKgm7ZPCI3qdF27xmzFDSVCbV7vKHfAF6xVkmAH9SpA1POuetS9E+nVrulmGPSDExpKVLXXHWnXSAKs2RMt9frww+qdkpjml0W3ovS4WqumqV2sMDkc5Ug9ZfXn60ZkCLxnu6+psaglZZ0qrQYwerIdrXa8Y7a6JN5Y1th5J6d8YIpOsYi0STFa3gXwaIgAxBaeczQAx4+/v7vd260ee1jSRzMmdxCOUKQZenmFAPhGXYK3y9jKW9SlLLMChTBK8LixxiqYSlQkSGctH1cDlU5Haaqwu8I0Ol9BLZu/27YR9BuFqQQoWNw7uCD4dqqBIxtqHBfultR5aWla70+8NhX+4sC1ZWBCoCfmqE37BTPsH0e7adhfc9Kqm52uPAccCzXHI1MtRKK8gfgDpFemM+N9P0OnGGnvQKmXlbDC2hpaBehgYRvRsalylokwwHpW3ktdiA5U8nFS21zxVTFcNkVlVntN5bAc2KrZLCDqnbJIrLg8OmSiI54lGfCKNFFXpQSQOtpIHYSacDMbMw14pCaJP+S4Fv4mhBXuNOfkXK57/iDP+ajxXyc661efz2aZzCb/kdfIzD2/ld/k9+C2djeL2gB89n+CRO8wf8Jr92gj98i3+3McMv8ksNr4b4J6eq+DD+wGfxPsO/46c9OFnPp/He6oZ+wzyB37/Dv8eJ9gRejTTyqyd38O+yoSKc7TnDf8RzfHwtPuIThfamrXzqTLaST9HyP/MRPrPhI5zll/lYKf6ykv8N7/LJNJ9r9nbzh5TxyCm69BGex2vH8PpRvIA/4CU/f9z2umcdfsWn8F8U9SXODeWfLcGRTAnVB9oFIiQaLRWEjs8g2g6dTnM/8Dwr1A9lkJ5DqEcVlQbEsRcdEH9WVNL08WU0LSsqSeSEWqaoJICTAuFgJAynFDGDY/PcsIXRoMjQJZB3bgMxRSVxxFUShbpECUzdL727qADT0g0aqr5Mn2/J9nwxvc+f1BOm/6nh/B7NZMOqCkC3o4yEUIYqlKEtq4weGXpzWYeDkMxlvRDyZKD1yZBazFpfRhNpGUCo9ukh02cm0kqs91l5GL+vQVMGS23MychWt6lt0r8X9JA7ek+04zhex8khvIXDOBzEGwT9j2f5T3wcv+ETeJ9PFOE9PoOPcI4gHcEv+BzO4V0cDZtrjHbbxFF+Gcdwsi3KH+JtnDtOdxvHG1G8sb4dP+/EmwWH+E28wC+Sxh6coXte4Hf47S14bdcWPsP4LJ+ObMR7xqpG9xG2Pc1/wpktm/nMVpzGf6zBXzfh433r+Vi2+WA7jjbn0boPPHhlM40VyKjEdH87DIipMLhkKgwtToXTNBWOOH0743EQyzqIEe0fqTmWz74oyx8LlodVWbD3ibT5t5oG5wSVDhsNdVH4lKA89v9BOe9zh+qD9CY/fYZK6J9AqTrZywZhxEH2M4HseYHs58si+4UMF0TBIgEY/Q1iL8owtkjsl8sQe0mGr56f2K9/n9hxGS4vEDshiD3+nMReWkrs2LPE7unAUaLwPo5aAtkV+4v4yDqP87dyxQHpajtcEyD9ZQlI1xdBurEA0qQA6Zt5kG7Og3TrRUGaEiDdngdpWn0al28JF/NFcXmCREm3lqRH+Sp32QF3KJRkGZZ0OwDV7HG+wnXucO2CXXZgb8BdVksn/NW2mXjQpKdv03mKudO8wvb9D/2gOpI= \ No newline at end of file diff --git a/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz b/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz deleted file mode 100644 index 85e51ebd..00000000 --- a/tests/client/cassetes/test_requests/test_requests_iter_raw_json.gz +++ /dev/null @@ -1 +0,0 @@ 
-eJyllvt3FFUSx4cgD2PAALKIBpgMCEPMPPMaw0bJOyGEBDLBDtILnZmbdCeT7ql+5AGOxABRBOUVyHFXObIGc6K7sj4OKiCeU/WLf9ZW94THYs7R407PD1M11ffWrfp+qnuyIAe+oFTk8/k03RamkrI1Q7dgmQwFwRwsD0qr+S9TWFn2CngmByuC0ip2qUJJC9OClTlYFZSeYU+TYgtYLcOz0ha2Wkyt3B+v9Hcqpj8ejdX4YxW1Fa/VxmL+1s4kFCrSSg7qEeaoMOE5GYqkErbf7Gltz/si0XDc39g9YauGHqkIV4arYY3ibXNYMSdgrQzPS2vZqk+lRNYu9zcaxrAmoFiRVrjeTMYYg3UyrJfWsNnd1ZMs93d1J9u7DvTABsU7bqPB59XtUHIiK+AFGTZKxexVstmMllLcIkSGLEOHvyiOt22/kZ6ATTl40Uvcsk1NH4TN0i42TgYsW7EdK1AbMIYD5YEho19LsxHPfyKxSFUiEcjBS9bivW4wvJyDknwpR4RlKYMCtkgFbHV1wFZvx5SRFrCt41fH8aJMAY6wbPDnoDSobpJeYV/WNIZEyq5b3GmnldW4KXWqFbI5NJQ3IaByk7YHvVLUO1xRUzvhnRB2yPCKFGd3g2JpKf+RkT969VXDTkUq5Dt7LWGG6ge5krBLhqDX+6zXtpCVMpUsl0l1+iPxcJSvtBiNwm5Fev5R50LNOp/TrWWZDK96KQ6e0LLl/rQYyLiCKs8rZbG/IRnCUodbHMscsI1hocfr+puUps7x6l69VRurqelrHNeS6aH+sfbWmF2VsRNGal89JEeGrXRvY7K9pu3g/owxMGpGs81CVCcziW6ILCGIqAwxqfQpQYyHxsbGQgOGORJyzIxwMxdpiOcLwbfrwqMHKmSo9HzDQmRDSkYbFVCleHp9uMl+oQ/aKlTLUON1vaIGEvmT5usCr8lQKy1nuyxSBntYhCs9pXBh0/BXTx+uqqHOi3FMDV6XdrhY2na2NhKpqAjnv1Wx2kQ0EY1wIyKmo4c9Tb/hODnYG1SZ5vqgK46G4FLJNcrQJC1jfxSaFenlxdwsK+TGmUYm5GEW6jK1QU2HFhlavegyaFuioO0y7JM2s9cW43Ykm1E0fY8/pSqmJey63mRLKAEdiqMyYfvVzRLH+aDTUhmSA0H1ERhdqseDygx0MwOpY24HUlZhvhWFcFB6jjPYhudX+3CylK5JBfQLXqOPOtbiVLR4JX2GP+O8H7+rjuMt9s/SDfyRZprqfHj5pTa1VODdAN2gT2mhBc/tw3/iPfqJ5vGaUTCEH+J0fH1Zh4UzDXQLF+gCza4r4jWu4vXx4zmaBbqDX9PnvjV0uZumY7SAl6vdDegMTW6vwemTuyvwa4NO01c454by6nfwB/qEPjZLcCGw7fCGJvyopYeu8k03af6dMZyme0N0d+du+mL90YEdXavC2+lMCd6kG8X0Hp3W2vnnf1rxLN1P4mQbzuHUqRKcpclxH02KU/Q5znIp4JCnGyZJ02PQMw3JQ9Drdvxw8Gl635RB+gP09ikqT40jf3ZqvKWoDPrRJUCXFZW5+ZtapahMxTFP17GaBBxXvMGcWhSTeDgvFBn6PQ7cZSClqExMWmVS1CfwEOrr0t7HWFi2YfKkDY+OhJ84XjhljEQWx6sV+Z+p/QYPa9Oui8KAi8ygh4zqIaMticyQDMP5zBNVkMln/jDk0aQbkUF/nLmxBCxZGUDa+Zvp48Kb0XRhPU2O+ft4WjLYalueMEfdrB6Utj5EJf9xOaHzhTi19wUGJYcX6Ru8je/RF/jlAfwOL+L1FXStdscunKFvcWrrplJ8cHL5uo1ButxHD2ie7qxJyPTzlrX0Zd9R+pbePxKmyb3NtEA3ca69/1X8vrKaruP1UvqK/oEz+AHexSk8/2LtPobpNpM5U1zEUM7jfU5jCm/Rh3S3GO9w7KUO/PcGnMYHDO/Zyl1DJbzZ3J4jeINRupI6WsQI/kK3aI5ub2zEq0zVg0CCvqFzeMlghn7qp3/xMvdwtpEulNIM3m/o3I0X6Me3gO7jObqk0Gd0Bc/gdJL+fvxtH10cxnN4Fq+49IyqjMzYIRj3JtHEE5PoxONJdJIn0QFXEm8HXQXnXAUzTO+oeVRO/VlUJj1U3lVlT9pTi9J2xdXanITTrOyG/0fZ/CxR6o4Niwk44zjCe8Xg9y7LfX6d7VjmhP8LWciWDQ== \ No newline at end of file diff --git a/tests/client/test_items.py b/tests/client/test_items.py index 38af756a..1dfaeaf1 100644 --- a/tests/client/test_items.py +++ b/tests/client/test_items.py @@ -1,10 +1,6 @@ -import json - import pytest from six.moves import range -from scrapinghub.hubstorage.serialization import mpdecode - def _add_test_items(job): for i in range(3): @@ -29,18 +25,6 @@ def test_items_iter(spider): with pytest.raises(StopIteration): next(o) - o = job.items.iter_raw_json(offset=2) - item = json.loads(next(o)) - assert item['id'] == 2 - assert item['data'] == 'data2' - with pytest.raises(StopIteration): - next(o) - - msgpacked_o = job.items.iter_raw_msgpack(offset=2) - o = mpdecode(msgpacked_o) - assert item['id'] == 2 - assert item['data'] == 'data2' - def test_items_list(spider): job = spider.jobs.run(meta={'state': 'running'}) diff --git a/tests/client/test_logs.py b/tests/client/test_logs.py index 52b42ec3..88cb4b85 100644 --- a/tests/client/test_logs.py +++ b/tests/client/test_logs.py @@ -1,11 +1,9 @@ -import json import types from numbers import Integral import pytest from scrapinghub.client.utils import LogLevel -from scrapinghub.hubstorage.serialization import mpdecode from .conftest import TEST_TS @@ -103,35 +101,3 @@ def test_logs_list_filter(spider): logs3 = 
job.logs.list(filter=[('message', 'contains', ['simple'])]) assert len(logs3) == 3 - - -def test_logs_iter_raw_json(spider): - job = spider.jobs.run() - _add_test_logs(job) - - logs0 = job.logs.iter_raw_json(offset=2) - raw_log0 = next(logs0) - log0 = json.loads(raw_log0) - assert log0.get('message') == 'simple-msg3' - assert log0.get('_key') - assert isinstance(log0.get('time'), Integral) - assert log0.get('level') == 10 - - logs1 = job.logs.iter_raw_json(level='ERROR') - raw_log1 = next(logs1) - log1 = json.loads(raw_log1) - assert log1.get('message') == 'error-msg' - - -def test_logs_iter_raw_msgpack(spider): - job = spider.jobs.run() - _add_test_logs(job) - - logs1 = job.logs.iter_raw_msgpack(offset=2) - assert isinstance(logs1, types.GeneratorType) - unpacked_logs1 = list(mpdecode(logs1)) - assert unpacked_logs1[0].get('message') == 'simple-msg3' - - logs2 = job.logs.iter_raw_msgpack(level='ERROR') - unpacked_logs2 = list(mpdecode(logs2)) - assert unpacked_logs2[0].get('message') == 'error-msg' diff --git a/tests/client/test_requests.py b/tests/client/test_requests.py index 1d2e3bca..a71b3820 100644 --- a/tests/client/test_requests.py +++ b/tests/client/test_requests.py @@ -1,5 +1,3 @@ -import json - import pytest from .conftest import TEST_TS @@ -39,18 +37,3 @@ def test_requests_iter(spider): } with pytest.raises(StopIteration): next(rr) - - -def test_requests_iter_raw_json(spider): - job = spider.jobs.run() - _add_test_requests(job) - job.requests.close() - - rr = job.requests.iter_raw_json() - raw_req = next(rr) - req = json.loads(raw_req) - assert req.get('url') == 'http://test.com/' - assert req.get('status') == 200 - next(rr), next(rr) - with pytest.raises(StopIteration): - next(rr) From efa665ae4a1921d1eba65ede3803110b23ee7867 Mon Sep 17 00:00:00 2001 From: Viktor Shlapakov Date: Wed, 29 Mar 2017 11:52:26 +0300 Subject: [PATCH 38/40] Minor docstring fixes: links and formatting --- scrapinghub/client/__init__.py | 4 ++-- scrapinghub/client/collections.py | 4 ++-- scrapinghub/client/frontiers.py | 20 +++++--------------- scrapinghub/client/items.py | 6 +++--- scrapinghub/client/jobs.py | 21 +++++++++++---------- scrapinghub/client/logs.py | 2 +- scrapinghub/client/requests.py | 2 +- 7 files changed, 25 insertions(+), 34 deletions(-) diff --git a/scrapinghub/client/__init__.py b/scrapinghub/client/__init__.py index 0cf5d080..5e9fbafa 100644 --- a/scrapinghub/client/__init__.py +++ b/scrapinghub/client/__init__.py @@ -70,9 +70,9 @@ def get_project(self, project_id): return self.projects.get(parse_project_id(project_id)) def get_job(self, job_key): - """Get Job with a given job key. + """Get :class:`~scrapinghub.client.jobs.Job` with a given job key. - :param job_key: job key string in format 'project_id/spider_id/job_id', + :param job_key: job key string in format ``project_id/spider_id/job_id``, where all the components are integers. :return: a job instance. :rtype: :class:`~scrapinghub.client.jobs.Job` diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index 960f0bfd..afa72917 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -27,7 +27,7 @@ class Collections(_Proxy, _DownloadableProxyMixin): def get(self, type_, name): """Base method to get a collection with a given type and name. - :param type_: a collection type string. + :param `type_`: a collection type string. :param name: a collection name string. :return: a collection object. 
:rtype: :class:`Collection` @@ -248,7 +248,7 @@ def create_writer(self, start=0, auth=None, size=1000, interval=15, :param maxitemsize: (optional) max item size in bytes. :param callback: (optional) some callback function. :return: a new writer object. - :rtype: :class:`~scrapinghub.hubstorage.batchuploader._BatchWriter` + :rtype: :class:`scrapinghub.hubstorage.batchuploader._BatchWriter` If provided - calllback shouldn't try to inject more items in the queue, otherwise it can lead to deadlocks. diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index a6c2f30e..f4d63887 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -113,11 +113,7 @@ def list(self): @property def newcount(self): - """Amount of new entries added to all frontiers. - - :return: amount of new entries. - :rtype: :class:`int` - """ + """Integer amount of new entries added to all frontiers.""" return sum(self._origin.newcount.values()) def flush(self): @@ -199,11 +195,7 @@ def flush(self): @property def newcount(self): - """Amount of new entries added to frontier. - - :return: amount of new entries. - :rtype: :class:`int` - """ + """Integer amount of new entries added to frontier.""" newcount_values = self._frontiers._origin.newcount return sum(v for (frontier, _), v in newcount_values.items() if frontier == self.key) @@ -298,16 +290,13 @@ def flush(self): @property def newcount(self): - """Amount of new entries added to slot. - - :return: amount of new entries. - :rtype: :class:`int` - """ + """Integer amount of new entries added to slot.""" newcount_values = self._frontier._frontiers._origin.newcount return newcount_values.get((self._frontier.key, self.key), 0) class FrontierSlotFingerprints(object): + """Representation of request fingerprints collection stored in slot.""" def __init__(self, slot): self.key = slot.key @@ -350,6 +339,7 @@ def list(self, **params): class FrontierSlotQueue(object): + """Representation of request batches queue stored in slot.""" def __init__(self, slot): self.key = slot.key diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 45d0e7d1..0ffeedfb 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -6,9 +6,9 @@ class Items(_ItemsResourceProxy, _DownloadableProxyMixin): """Representation of collection of job items. - Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` instanc - e to get a :class:`Items` instance. - See :attr:`~scrapinghub.client.jobs.Job.items` attribute. + Not a public constructor: use :class:`~scrapinghub.client.jobs.Job` + instance to get a :class:`Items` instance. See + :attr:`~scrapinghub.client.jobs.Job.items` attribute. Please note that :meth:`list` method can use a lot of memory and for a large amount of logs it's recommended to iterate through it via diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index a022e85e..1996264e 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -24,7 +24,7 @@ class Jobs(object): and :attr:`scrapinghub.client.spiders.Spider.jobs` attributes. :ivar project_id: a string project id. - :ivar spider: :class:`Spider` object if defined. + :ivar spider: :class:`~scrapinghub.client.spiders.Spider` object if defined. Usage:: @@ -112,16 +112,16 @@ def iter(self, count=None, start=None, spider=None, state=None, >>> [job['key'] for job in jobs_summary] ['123/1/3', '123/1/2', '123/1/1'] - - job summary fieldset is less detailed than job.metadata but contains - few new fields as well. 
Additional fields can be requested using - ``meta`` parameter. If it's used, then it's up to the user to list - all the required fields, so only few default fields would be added - except requested ones:: + - job summary fieldset is less detailed than :class:`JobMeta` but + contains a few new fields as well. Additional fields can be requested + using ``meta`` parameter. If it's used, then it's up to the user to + list all the required fields, so only few default fields would be + added except requested ones:: >>> jobs_summary = project.jobs.iter(meta=['scheduled_by', ]) - by default :meth:`Jobs.iter` returns maximum last 1000 results. - Pagination is available using start parameter:: + Pagination is available using start parameter:: >>> jobs_summary = spider.jobs.iter(start=1000) @@ -227,13 +227,14 @@ def run(self, spider=None, units=None, priority=None, meta=None, return Job(self._client, response['jobid']) def get(self, job_key): - """Get a Job with a given job_key. + """Get a :class:`Job` with a given job_key. :param job_key: a string job key. job_key's project component should match the project used to get :class:`Jobs` instance, and job_key's spider component should match - the spider (if :attr:`Spider.jobs` was used). + the spider (if :class:`~scrapinghub.client.spiders.Spider` was used + to get :class:`Jobs` instance). :return: a job object. :rtype: :class:`Job` @@ -509,7 +510,7 @@ class JobMeta(_MappingProxy): """Class representing job metadata. Not a public constructor: use :class:`Job` instance to get a - :class:`JobMeta` instance. See :attr:`Job.metadata` attribute. + :class:`JobMeta` instance. See :attr:`~Job.metadata` attribute. Usage: diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 130e5720..261934a9 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -54,7 +54,7 @@ def log(self, message, level=logging.INFO, ts=None, **other): :param message: a string message. :param level: (optional) logging level, default to INFO. - :param ts: (optional) unix timestamp in milliseconds. + :param ts: (optional) UNIX timestamp in milliseconds. :param \*\*other: other optional kwargs. """ self._origin.log(message, level=level, ts=ts, **other) diff --git a/scrapinghub/client/requests.py b/scrapinghub/client/requests.py index 13acf6e6..7f5428ef 100644 --- a/scrapinghub/client/requests.py +++ b/scrapinghub/client/requests.py @@ -50,7 +50,7 @@ def add(self, url, status, method, rs, parent, duration, ts, fp=None): :param rs: response body length. :param parent: parent request id or ``None``. :param duration: request duration in milliseconds. - :param ts: unix timestamp in milliseconds. + :param ts: UNIX timestamp in milliseconds. :param fp: (optional) string fingerprint for the request. 
""" return self._origin.add( From 4acc40efcb5fd6fc6a952082de0b334163a72a10 Mon Sep 17 00:00:00 2001 From: Alexander Chekunkov Date: Wed, 29 Mar 2017 11:35:14 +0100 Subject: [PATCH 39/40] Fix Python examples in documentation --- docs/client/overview.rst | 16 ++++++++-------- docs/legacy/hubstorage.rst | 10 +++++----- docs/quickstart.rst | 2 +- scrapinghub/client/activity.py | 10 +++++----- scrapinghub/client/collections.py | 4 ++-- scrapinghub/client/items.py | 2 +- scrapinghub/client/jobs.py | 2 +- scrapinghub/client/logs.py | 2 +- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/client/overview.rst b/docs/client/overview.rst index 6a9c2c28..126e4fee 100644 --- a/docs/client/overview.rst +++ b/docs/client/overview.rst @@ -145,8 +145,8 @@ Scheduling logic supports different options, like For example, to run a new job for a given spider with custom params:: - >>> job = spider.jobs.run(units=2, job_settings={'SETTING': 'VALUE'}, - priority=1, add_tag=['tagA','tagB'], meta={'custom-data': 'val1'}) + >>> job = spider.jobs.run(units=2, job_settings={'SETTING': 'VALUE'}, priority=1, + ... add_tag=['tagA','tagB'], meta={'custom-data': 'val1'}) Note that if you run a job on project level, spider name is required:: @@ -192,7 +192,7 @@ ones:: >>> job_summary = next(project.jobs.iter()) >>> job_summary.get('spider', 'missing') 'foo' - >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by', ]) + >>> jobs_summary = project.jobs.iter(jobmeta=['scheduled_by']) >>> job_summary = next(jobs_summary) >>> job_summary.get('scheduled_by', 'missing') 'John' @@ -227,10 +227,10 @@ for filtering by state: - finished - deleted -Dict entries returned by ``iter`` method contain some additional meta, +Dictionary entries returned by ``iter`` method contain some additional meta, but can be easily converted to :class:`~scrapinghub.client.jobs.Job` instances with:: - >>> [Job(x['key']) for x in jobs] + >>> [Job(client, x['key']) for x in jobs] [ , , @@ -422,9 +422,9 @@ To post a new activity event:: Or post multiple events at once:: >>> events = [ - {'event': 'job:completed', 'job': '123/2/5', 'user': 'john'}, - {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, - ] + ... {'event': 'job:completed', 'job': '123/2/5', 'user': 'john'}, + ... {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, + ... ] >>> project.activity.add(events) diff --git a/docs/legacy/hubstorage.rst b/docs/legacy/hubstorage.rst index 5a024faa..d1e06ea4 100644 --- a/docs/legacy/hubstorage.rst +++ b/docs/legacy/hubstorage.rst @@ -130,7 +130,7 @@ If it used, then it's up to the user to list all the required fields, so only fe >>> metadata = next(project.jobq.list()) >>> metadata.get('spider', 'missing') u'foo' - >>> jobs_metadata = project.jobq.list(jobmeta=['scheduled_by', ]) + >>> jobs_metadata = project.jobq.list(jobmeta=['scheduled_by']) >>> metadata = next(jobs_metadata) >>> metadata.get('scheduled_by', 'missing') u'John' @@ -150,7 +150,7 @@ List of tags has ``OR`` power, so in the case above jobs with 'new' or 'verified To get certain number of last finished jobs per some spider:: - >>> jobs_metadata = project.jobq.list(spider='foo', state='finished' count=3) + >>> jobs_metadata = project.jobq.list(spider='foo', state='finished', count=3) There are 4 possible job states, which can be used as values for filtering by state: @@ -167,7 +167,7 @@ To iterate through items:: >>> items = job.items.iter_values() >>> for item in items: - # do something, item is just a dict + ... 
# do something, item is just a dict Logs ^^^^ @@ -176,7 +176,7 @@ To iterate through 10 first logs for example:: >>> logs = job.logs.iter_values(count=10) >>> for log in logs: - # do something, log is a dict with log level, message and time keys + ... # do something, log is a dict with log level, message and time keys Collections ^^^^^^^^^^^ @@ -246,4 +246,4 @@ Module contents :undoc-members: :show-inheritance: -.. _scrapinghub.ScrapinghubClient: ../client/overview.html +.. _scrapinghub.ScrapinghubClient: ../client/overview.html diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 3fced7e0..426e0475 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -36,7 +36,7 @@ Work with your projects:: Run new jobs from the client:: >>> project = client.get_project(123) - >>> project.jobs.run('spider1', job_args={'arg1':'val1'}) + >>> project.jobs.run('spider1', job_args={'arg1': 'val1'}) > Access your jobs data:: diff --git a/scrapinghub/client/activity.py b/scrapinghub/client/activity.py index 665ee3b6..b5d1777f 100644 --- a/scrapinghub/client/activity.py +++ b/scrapinghub/client/activity.py @@ -31,16 +31,16 @@ class Activity(_Proxy): - post a new event:: >>> event = {'event': 'job:completed', - 'job': '123/2/4', - 'user': 'jobrunner'} + ... 'job': '123/2/4', + ... 'user': 'jobrunner'} >>> project.activity.add(event) - post multiple events at once:: >>> events = [ - {'event': 'job:completed', 'job': '123/2/5', 'user': 'jobrunner'}, - {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, - ] + ... {'event': 'job:completed', 'job': '123/2/5', 'user': 'jobrunner'}, + ... {'event': 'job:cancelled', 'job': '123/2/6', 'user': 'john'}, + ... ] >>> project.activity.add(events) """ diff --git a/scrapinghub/client/collections.py b/scrapinghub/client/collections.py index afa72917..10f78e2e 100644 --- a/scrapinghub/client/collections.py +++ b/scrapinghub/client/collections.py @@ -108,7 +108,7 @@ class Collection(object): - add a new item to collection:: >>> foo_store.set({'_key': '002d050ee3ff6192dcbecc4e4b4457d7', - 'value': '1447221694537'}) + ... 'value': '1447221694537'}) - count items in collection:: @@ -128,7 +128,7 @@ class Collection(object): - iterate iterate over _key & value pair:: >>> for elem in foo_store.iter(count=1)): - >>> ... print(elem) + ... print(elem) [{'_key': '002d050ee3ff6192dcbecc4e4b4457d7', 'value': '1447221694537'}] - filter by multiple keys, only values for keys that exist will be returned:: diff --git a/scrapinghub/client/items.py b/scrapinghub/client/items.py index 0ffeedfb..c3d5828a 100644 --- a/scrapinghub/client/items.py +++ b/scrapinghub/client/items.py @@ -25,7 +25,7 @@ class Items(_ItemsResourceProxy, _DownloadableProxyMixin): - iterate through first 100 items and print them:: >>> for log in job.logs.iter(count=100): - >>> ... print(log) + ... 
print(log) - retrieve items with timestamp greater or equal to given timestamp (item here is an arbitrary dictionary depending on your code):: diff --git a/scrapinghub/client/jobs.py b/scrapinghub/client/jobs.py index 1996264e..f6813976 100644 --- a/scrapinghub/client/jobs.py +++ b/scrapinghub/client/jobs.py @@ -540,7 +540,7 @@ class JobMeta(_MappingProxy): - update multiple meta fields at once - >>> job.metadata.update({'my-meta1': 'test1', 'my-meta2': 'test2}) + >>> job.metadata.update({'my-meta1': 'test1', 'my-meta2': 'test2'}) - delete meta field by name:: diff --git a/scrapinghub/client/logs.py b/scrapinghub/client/logs.py index 261934a9..2c68d800 100644 --- a/scrapinghub/client/logs.py +++ b/scrapinghub/client/logs.py @@ -28,7 +28,7 @@ class Logs(_ItemsResourceProxy, _DownloadableProxyMixin): - iterate through first 100 log entries and print them:: >>> for log in job.logs.iter(count=100): - >>> ... print(log) + ... print(log) - retrieve a single log entry from a job:: From 350472e456cfc2c8e578f237e82e8892f0d255ce Mon Sep 17 00:00:00 2001 From: Alexander Chekunkov Date: Wed, 29 Mar 2017 11:44:53 +0100 Subject: [PATCH 40/40] Fix typo in Frontier.iter() rtype --- scrapinghub/client/frontiers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapinghub/client/frontiers.py b/scrapinghub/client/frontiers.py index f4d63887..72f4edd4 100644 --- a/scrapinghub/client/frontiers.py +++ b/scrapinghub/client/frontiers.py @@ -174,7 +174,7 @@ def iter(self): """Iterate through slots. :return: an iterator over frontier slots names. - :rtype: :class:`collections.Iterate[str]` + :rtype: :class:`collections.Iterable[str]` """ return iter(self.list())
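
A short usage sketch for the corrected ``Frontier.iter()`` above (the API key,
project id, frontier name and slot name are made up for illustration)::

    >>> from scrapinghub import ScrapinghubClient
    >>> client = ScrapinghubClient('APIKEY')
    >>> frontier = client.get_project(123).frontiers.get('test')
    >>> for slot_name in frontier.iter():
    ...     print(slot_name)
    example.com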