25 changes: 23 additions & 2 deletions docs/source/api.rst
@@ -57,11 +57,19 @@ spider_name

url
- type: string
- required
- required if ``start_requests`` is not enabled

Absolute URL to send the request to. The URL should be urlencoded so that
its querystring does not interfere with API parameters.

By default the API will crawl this url and won't execute any other requests.
Most importantly, it will not execute ``start_requests``, and the spider will
not visit urls defined in the ``start_urls`` spider attribute. Only a single
request will be scheduled by the API - the request for the resource
identified by the url argument.

If you want the spider to execute its ``start_requests`` method, pass the
``start_requests`` argument.

callback
- type: string
- optional
@@ -73,13 +81,26 @@ max_requests
- type: integer
- optional

Maximal amount of requests spider can generate. E.g. if it is set to ``1``
Maximum amount of requests spider can generate. E.g. if it is set to ``1``
the spider will only schedule a single request; other requests generated
by the spider (for example in a callback, following links in the first
response) will be ignored. If your spider generates many requests in
callbacks and you don't want to wait forever for it to finish,
you should probably pass this argument.

start_requests
- type: boolean
- optional

Whether the spider should execute the ``scrapy.Spider.start_requests`` method.
``start_requests`` is executed by default when you run a Scrapy spider
normally, without ScrapyRT, but this method is NOT executed by the API by
default. By default the API assumes that the spider is expected to crawl ONLY
the url provided in the parameters, without making any requests to the
``start_urls`` defined in the ``Spider`` class. The ``start_requests`` argument
overrides this behavior: if it is present, the API will execute the spider's
``start_requests`` method.
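
For illustration, a minimal sketch of a POST request that enables
``start_requests``, assuming a local ScrapyRT instance on port 9080 (the
default) and a spider named ``test_with_sr``, which is only an example name::

    import requests

    data = {
        "spider_name": "test_with_sr",  # hypothetical spider name
        "start_requests": True,
        # "request" may be omitted here: the spider's own start_requests()
        # supplies the initial URLs.
        "max_requests": 2,  # optional cap on scheduled requests
    }
    response = requests.post("http://localhost:9080/crawl.json", json=data)
    print(response.json())

Without ``start_requests`` the same call would need a ``request`` object
containing at least a ``url``.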

If required parameters are missing, the API will return 400 Bad Request
with a hopefully helpful error message.

10 changes: 5 additions & 5 deletions requirements-dev.txt
@@ -1,8 +1,8 @@
-r requirements.txt

fabric
requests
mock
pytest
pytest-cov
port-for
requests==2.9.1
mock==1.3.0
pytest==2.9.1
pytest-cov==2.2.1
port-for==0.3.1
21 changes: 8 additions & 13 deletions scrapyrt/core.py
@@ -130,7 +130,7 @@ class CrawlManager(object):
Runs crawls
"""

def __init__(self, spider_name, request_kwargs, max_requests=None):
def __init__(self, spider_name, request_kwargs, max_requests=None, start_requests=False):
self.spider_name = spider_name
self.log_dir = settings.LOG_DIR
self.items = []
@@ -145,8 +145,11 @@ def __init__(self, spider_name, request_kwargs, max_requests=None):
# callback will be added after instantiation of crawler object
# because we need to know if spider has method available
self.callback_name = request_kwargs.pop('callback', None) or 'parse'
self.request = self.create_spider_request(deepcopy(request_kwargs))
self.start_requests = False
if request_kwargs.get("url"):
self.request = self.create_spider_request(deepcopy(request_kwargs))
else:
self.request = None
self.start_requests = start_requests
self._request_scheduled = False

def crawl(self, *args, **kwargs):
Expand Down Expand Up @@ -189,7 +192,7 @@ def spider_idle(self, spider):
which is totally wrong.

"""
if spider is self.crawler.spider and not self._request_scheduled:
if spider is self.crawler.spider and self.request and not self._request_scheduled:
callback = getattr(self.crawler.spider, self.callback_name)
assert callable(callback), 'Invalid callback'
self.request = self.request.replace(callback=callback)
Expand Down Expand Up @@ -263,15 +266,7 @@ def create_spider_request(self, kwargs):
try:
req = Request(url, **kwargs)
except (TypeError, ValueError) as e:
# Bad arguments for scrapy Request
# we don't want to schedule spider if someone
# passes meaningless arguments to Request.
# We must raise this here so that this will be returned to client,
# Otherwise if this is raised in spider_idle it goes to
# spider logs where it does not really belong.
# It is needed because in POST handler we can pass
# all possible requests kwargs, so it is easy to make mistakes.
message = "Error while creating Request, {}".format(e.message)
message = "Error while creating Scrapy Request, {}".format(e.message)
raise Error('400', message=message)

req.dont_filter = True
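
As a side note, a rough sketch (not part of the patch) of how the two modes of
the updated CrawlManager constructor differ; the spider name is hypothetical:

    from scrapyrt.core import CrawlManager

    # Single-URL mode, as before: a scrapy Request is built from the kwargs
    # and scheduled once the spider goes idle.
    manager = CrawlManager("test", {"url": "http://example.com/page1.html"})

    # start_requests mode: no url, so self.request stays None and the
    # spider's own start_requests() generates the initial requests.
    manager = CrawlManager("test", {}, start_requests=True)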
87 changes: 48 additions & 39 deletions scrapyrt/resources.py
@@ -1,14 +1,15 @@
# -*- coding: utf-8 -*-
import demjson
from scrapy.utils.misc import load_object
from scrapy.utils.serialize import ScrapyJSONEncoder
from twisted.internet.defer import Deferred
from twisted.python.failure import Failure
from twisted.web import server, resource
from twisted.web.error import UnsupportedMethod, Error
import demjson
from twisted.web import resource, server
from twisted.web.error import Error, UnsupportedMethod

from . import log
from .conf import settings
from .utils import extract_scrapy_request_args


class ServiceResource(resource.Resource, object):
@@ -110,24 +111,14 @@ def render_GET(self, request, **kwargs):
At the moment kwargs for scrapy request are not supported in GET.
They are supported in POST handler.
"""
request_data = dict(
api_params = dict(
(name.decode('utf-8'), value[0].decode('utf-8'))
for name, value in request.args.items()
)

spider_data = {
'url': self.get_required_argument(request_data, 'url'),
# TODO get optional Request arguments here
# distinguish between proper Request args and
# api parameters
}
try:
callback = request_data['callback']
except KeyError:
pass
else:
spider_data['callback'] = callback
return self.prepare_crawl(request_data, spider_data, **kwargs)
scrapy_request_args = extract_scrapy_request_args(api_params,
raise_error=False)
self.validate_options(scrapy_request_args, api_params)
return self.prepare_crawl(api_params, scrapy_request_args, **kwargs)

def render_POST(self, request, **kwargs):
"""
@@ -147,66 +138,85 @@ def render_POST(self, request, **kwargs):
"""
request_body = request.content.getvalue()
try:
request_data = demjson.decode(request_body)
api_params = demjson.decode(request_body)
except ValueError as e:
message = "Invalid JSON in POST body. {}"
message = message.format(e.pretty_description())
raise Error('400', message=message)

log.msg("{}".format(request_data))
spider_data = self.get_required_argument(request_data, "request")
error_msg = "Missing required key 'url' in 'request' object"
self.get_required_argument(spider_data, "url", error_msg=error_msg)
log.msg("{}".format(api_params))
if api_params.get("start_requests"):
# start requests passed so 'request' argument is optional
_request = api_params.get("request", {})
else:
# no start_requests, 'request' is required
_request = self.get_required_argument(api_params, "request")
try:
scrapy_request_args = extract_scrapy_request_args(
_request, raise_error=True
)
except ValueError as e:
raise Error(400, e.message)

self.validate_options(scrapy_request_args, api_params)
return self.prepare_crawl(api_params, scrapy_request_args, **kwargs)

return self.prepare_crawl(request_data, spider_data, **kwargs)
def validate_options(self, scrapy_request_args, api_params):
url = scrapy_request_args.get("url")
start_requests = api_params.get("start_requests")
if not url and not start_requests:
raise Error(400,
"'url' is required if start_requests are disabled")

def get_required_argument(self, request_data, name, error_msg=None):
def get_required_argument(self, api_params, name, error_msg=None):
"""Get required API key from dict-like object.

:param dict request_data:
:param dict api_params:
dictionary with names and values of parameters supplied to API.
:param str name:
required key that must be found in request_data
required key that must be found in api_params
:return: value of required param
:raises Error: Bad Request response

"""
if error_msg is None:
error_msg = 'Missing required parameter: {}'.format(repr(name))
try:
value = request_data[name]
value = api_params[name]
except KeyError:
raise Error('400', message=error_msg)
if not value:
raise Error('400', message=error_msg)
return value

def prepare_crawl(self, request_data, spider_data, *args, **kwargs):
def prepare_crawl(self, api_params, scrapy_request_args, *args, **kwargs):
"""Schedule given spider with CrawlManager.

:param dict request_data:
:param dict api_params:
arguments needed to find spider and set proper api parameters
for crawl (max_requests for example)

:param dict spider_data:
:param dict scrapy_request_args:
should contain positional and keyword arguments for Scrapy
Request object that will be created
"""
spider_name = self.get_required_argument(request_data, 'spider_name')
spider_name = self.get_required_argument(api_params, 'spider_name')
start_requests = api_params.get("start_requests", False)
try:
max_requests = request_data['max_requests']
max_requests = api_params['max_requests']
except (KeyError, IndexError):
max_requests = None
dfd = self.run_crawl(
spider_name, spider_data, max_requests, *args, **kwargs)
spider_name, scrapy_request_args, max_requests,
start_requests=start_requests, *args, **kwargs)
dfd.addCallback(
self.prepare_response, request_data=request_data, *args, **kwargs)
self.prepare_response, request_data=api_params, *args, **kwargs)
return dfd

def run_crawl(self, spider_name, spider_data,
max_requests=None, *args, **kwargs):
def run_crawl(self, spider_name, scrapy_request_args,
max_requests=None, start_requests=False, *args, **kwargs):
crawl_manager_cls = load_object(settings.CRAWL_MANAGER)
manager = crawl_manager_cls(spider_name, spider_data, max_requests)
manager = crawl_manager_cls(spider_name, scrapy_request_args, max_requests, start_requests=start_requests)
dfd = manager.crawl(*args, **kwargs)
return dfd

Expand All @@ -223,4 +233,3 @@ def prepare_response(self, result, *args, **kwargs):
if errors:
response["errors"] = errors
return response
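
For reference, a sketch of the parameter combinations the updated handlers now
accept or reject (spider name and urls are illustrative):

    # Accepted: explicit request with a url; start_requests defaults to False.
    {"spider_name": "test", "request": {"url": "http://example.com/page.html"}}

    # Accepted: start_requests enabled, so the "request" object may be omitted.
    {"spider_name": "test", "start_requests": True}

    # Rejected by validate_options() with 400: no url and start_requests disabled.
    {"spider_name": "test", "request": {"callback": "parse"}}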

21 changes: 21 additions & 0 deletions scrapyrt/utils.py
@@ -0,0 +1,21 @@
import inspect
from scrapy import Request


def extract_scrapy_request_args(dictionary, raise_error=False):
"""
:param dictionary: Dictionary with parameters passed to API
:param raise_error: raise ValueError if key is not valid arg for
scrapy.http.Request
:return: dictionary of valid scrapy.http.Request positional and keyword
arguments.
"""
result = dictionary.copy()
args = inspect.getargspec(Request.__init__).args
for key in dictionary.keys():
if key not in args:
result.pop(key)
if raise_error:
msg = u"{!r} is not a valid argument for scrapy.Request.__init__"
raise ValueError(msg.format(key))
return result
Contributor

@chekunkov Jun 23, 2016

good idea~~, what about rewriting it slightly?~~ + even short docstring is better than no docstring at all

UPD: probably you should ignore this snippet, see comment below.

def extract_scrapy_request_args(dictionary, raise_error=False):
    result = {}
    args = inspect.getargspec(Request.__init__).args
    for key, value in dictionary.items():
        if key in args:
            result[key] = value
        elif raise_error:
            msg = u"{!r} is not a valid argument for scrapy.Request"
            raise ValueError(msg.format(key))
    return result

Contributor

... unless result.pop() was put in original version intentionally to split passed dictionary into spider data and request data. Please document this and let's keep version with .pop()

Member Author

I added docstring and unit test for utility
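
For reference, a small usage sketch of the helper as merged (parameter values
are made up): it keeps only keys that are valid scrapy.Request arguments and,
with raise_error=True, rejects anything else:

    from scrapyrt.utils import extract_scrapy_request_args

    params = {"url": "http://example.com", "callback": "parse", "max_requests": 5}

    # max_requests is not a Request argument, so it is dropped silently here.
    extract_scrapy_request_args(params, raise_error=False)
    # -> {"url": "http://example.com", "callback": "parse"}

    # With raise_error=True the same input raises ValueError:
    # "'max_requests' is not a valid argument for scrapy.Request.__init__"
    extract_scrapy_request_args(params, raise_error=True)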

2 changes: 2 additions & 0 deletions tests/sample_data/testproject/testproject/items.py
@@ -4,3 +4,5 @@

class TestprojectItem(scrapy.Item):
name = scrapy.Field()
referer = scrapy.Field()
response = scrapy.Field()
23 changes: 23 additions & 0 deletions tests/sample_data/testproject/testproject/spider_templates/testspider_startrequests.py
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
import scrapy

from ..items import TestprojectItem


class TestSpider(scrapy.Spider):

name = 'test_with_sr'
initial_urls = ["{0}", "{1}"]

def start_requests(self):
for url in self.initial_urls:
yield scrapy.Request(url, callback=self.some_callback, meta=dict(referer=url))

def some_callback(self, response):
name = response.xpath('//h1/text()').extract()
return TestprojectItem(name=name, referer=response.meta["referer"])

def parse(self, response):
name = response.xpath("//h1/text()").extract()
return TestprojectItem(name=name, referer=response.meta.get("referer"),
response=response.url)
4 changes: 2 additions & 2 deletions tests/sample_data/testsite/page2.html
@@ -1,12 +1,12 @@
<html>

<head>
<title>Page 1</title>
<title>Page 2</title>
</head>

<body>

<h1>Page 1</h1>
<h1>Page 2</h1>

</body>
</html>
12 changes: 12 additions & 0 deletions tests/sample_data/testsite/page3.html
@@ -0,0 +1,12 @@
<html>

<head>
<title>Page 3</title>
</head>

<body>

<h1>Page 3</h1>

</body>
</html>
13 changes: 12 additions & 1 deletion tests/servers.py
@@ -103,17 +103,28 @@ def _non_block_read(output):

class ScrapyrtTestServer(BaseTestServer):

def __init__(self, *args, **kwargs):
def __init__(self, site=None, *args, **kwargs):
super(ScrapyrtTestServer, self).__init__(*args, **kwargs)
self.arguments = [
sys.executable, '-m', 'scrapyrt.cmdline', '-p', str(self.port)
]
self.stderr = PIPE
self.tmp_dir = tempfile.mkdtemp()
self.cwd = os.path.join(self.tmp_dir, 'testproject')

source = os.path.join(SAMPLE_DATA, 'testproject')
shutil.copytree(
source, self.cwd, ignore=shutil.ignore_patterns('*.pyc'))
# Pass site url to spider doing start requests
spider_name = "testspider_startrequests.py"
spider_filename = os.path.join(self.cwd, "testproject", "spider_templates", spider_name)
spider_target_place = os.path.join(self.cwd, "testproject", "spiders", spider_name)
if not site:
return
with open(spider_filename) as spider_file:
spider_string = spider_file.read().format(site.url("page1.html"), site.url("page2.html"))
with open(spider_target_place, "wb") as file_target:
file_target.write(spider_string)

def stop(self):
super(ScrapyrtTestServer, self).stop()