From 32ef41b6ba36a6c4f870c624e52e494e72c59352 Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Fri, 25 Mar 2016 14:53:07 +0100 Subject: [PATCH 01/17] skip failing test TODO investigate later --- tests/test_crawl_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_crawl_manager.py b/tests/test_crawl_manager.py index 2761fdb..d8b6554 100644 --- a/tests/test_crawl_manager.py +++ b/tests/test_crawl_manager.py @@ -2,6 +2,7 @@ from time import sleep import datetime +import pytest from mock import patch, MagicMock from scrapy import Item from scrapy.exceptions import DontCloseSpider @@ -169,6 +170,7 @@ def _test_limit_runtime(self): self.crawl_manager.limit_runtime(self.spider) self.assertTrue(self.crawler.engine.close_spider.called) + @pytest.skip("why is it failing?") def test_limit_runtime(self): self._test_limit_runtime() From 2d4da86d4e5c198ff7a405aa7a7325698d38cffc Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Fri, 25 Mar 2016 16:26:22 +0100 Subject: [PATCH 02/17] [crawl_resource] allow url if start_requests=True * add common logic of extracting api arguments in POST and GET * extract Scrapy Request arguments and parameters for API into separate variables * validate Scrapy Request arguments on resource level * update test so that they test POST and GET handlers --- scrapyrt/core.py | 12 ++--- scrapyrt/resources.py | 47 ++++++++++++------ scrapyrt/utils.py | 14 ++++++ tests/test_resource_crawl.py | 94 ++++++++++++++++++++++++++++++------ 4 files changed, 127 insertions(+), 40 deletions(-) create mode 100644 scrapyrt/utils.py diff --git a/scrapyrt/core.py b/scrapyrt/core.py index 1688175..906a5b9 100644 --- a/scrapyrt/core.py +++ b/scrapyrt/core.py @@ -156,6 +156,7 @@ def crawl(self, *args, **kwargs): dfd = self.crawler_process.crawl(self.spider_name, *args, **kwargs) except KeyError as e: # Spider not found. + # TODO is it the only possible exception here? raise Error('404', message=e.message) dfd.addCallback(self.return_items) return dfd @@ -263,17 +264,10 @@ def create_spider_request(self, kwargs): try: req = Request(url, **kwargs) except (TypeError, ValueError) as e: - # Bad arguments for scrapy Request - # we don't want to schedule spider if someone - # passes meaingless arguments to Request. - # We must raise this here so that this will be returned to client, - # Otherwise if this is raised in spider_idle it goes to - # spider logs where it does not really belong. - # It is needed because in POST handler we can pass - # all possible requests kwargs, so it is easy to make mistakes. - message = "Error while creating Request, {}".format(e.message) + message = "Error while creating Scrapy Request, {}".format(e.message) raise Error('400', message=message) + # why is it here? req.dont_filter = True msg = u"Created request for spider {} with url {} and kwargs {}" msg = msg.format(self.spider_name, url, repr(kwargs)) diff --git a/scrapyrt/resources.py b/scrapyrt/resources.py index 65e4493..b69ecad 100644 --- a/scrapyrt/resources.py +++ b/scrapyrt/resources.py @@ -1,12 +1,14 @@ # -*- coding: utf-8 -*- +import demjson + from scrapy.utils.misc import load_object from scrapy.utils.serialize import ScrapyJSONEncoder from twisted.internet.defer import Deferred from twisted.python.failure import Failure from twisted.web import server, resource from twisted.web.error import UnsupportedMethod, Error -import demjson +from scrapyrt.utils import extract_scrapy_request_args from . 
import log from .conf import settings @@ -103,6 +105,11 @@ class CrawlResource(ServiceResource): isLeaf = True allowedMethods = ['GET', 'POST'] + api_parameters = { + "spider_name", + "max_requests", + "start_requests" + } def render_GET(self, request, **kwargs): """Request querysting must contain following keys: url, spider_name. @@ -114,21 +121,12 @@ def render_GET(self, request, **kwargs): (name.decode('utf-8'), value[0].decode('utf-8')) for name, value in request.args.items() ) - - spider_data = { - 'url': self.get_required_argument(request_data, 'url'), - # TODO get optional Request arguments here - # distinguish between proper Request args and - # api parameters - } - try: - callback = request_data['callback'] - except KeyError: - pass - else: - spider_data['callback'] = callback + spider_data = extract_scrapy_request_args(request_data, raise_error=False) + request_data = self._get_api_params(request_data) + self.validate_options(spider_data, request_data) return self.prepare_crawl(request_data, spider_data, **kwargs) + def render_POST(self, request, **kwargs): """ :param request: @@ -155,11 +153,28 @@ def render_POST(self, request, **kwargs): log.msg("{}".format(request_data)) spider_data = self.get_required_argument(request_data, "request") - error_msg = "Missing required key 'url' in 'request' object" - self.get_required_argument(spider_data, "url", error_msg=error_msg) + try: + spider_data = extract_scrapy_request_args(spider_data, raise_error=True) + except ValueError as e: + raise Error(400, e.message) + request_data = self._get_api_params(request_data) + self.validate_options(spider_data, request_data) return self.prepare_crawl(request_data, spider_data, **kwargs) + def _get_api_params(self, request_data): + api_params = {} + for k, v in request_data.items(): + if any(k == p for p in self.api_parameters): + api_params[k] = v + return api_params + + def validate_options(self, spider_data, api_params): + url = spider_data.get("url") + start_requests = api_params.get("start_requests") + if not url and not start_requests: + raise Error(400, "'url' is required if start_requests are disabled") + def get_required_argument(self, request_data, name, error_msg=None): """Get required API key from dict-like object. 
diff --git a/scrapyrt/utils.py b/scrapyrt/utils.py new file mode 100644 index 0000000..5c94adc --- /dev/null +++ b/scrapyrt/utils.py @@ -0,0 +1,14 @@ +import inspect +from scrapy import Request + + +def extract_scrapy_request_args(dictionary, raise_error=False): + result = dictionary.copy() + args = inspect.getargspec(Request.__init__).args + for key in dictionary.keys(): + if key not in args: + result.pop(key) + if raise_error: + msg = u"{!r} is not a valid argument for scrapy.Request.__init__" + raise ValueError(msg.format(key)) + return result diff --git a/tests/test_resource_crawl.py b/tests/test_resource_crawl.py index 1258cd3..80714a7 100644 --- a/tests/test_resource_crawl.py +++ b/tests/test_resource_crawl.py @@ -54,35 +54,83 @@ def tearDown(self): self.server.stop() self.site.stop() + def get_and_post(self, url, api_params, spider_data): + get_params = api_params.copy() + get_params.update(spider_data) + res_get = requests.get( + url, params=get_params + ) + post_data = { + "request": spider_data + } + post_data.update(api_params) + res_post = requests.post( + url, + json=post_data + ) + return res_get, res_post + def test_no_parameters(self): - res = requests.get(self.crawl_url) - assert res.status_code == 400 - res_json = res.json() - expected_result = {u'status': u'error', u'code': 400} - self.assertDictContainsSubset(expected_result, res_json) - assert 'url' in res_json['message'] + r1, r2 = self.get_and_post(self.crawl_url, {}, {}) + for res in (r1, r2): + assert res.status_code == 400 + res_json = res.json() + expected_result = {u'status': u'error', u'code': 400} + self.assertDictContainsSubset(expected_result, res_json) + if res.request.method == "GET": + assert 'url' in res_json['message'] + else: + assert "request" in res_json["message"] + + def test_no_url_no_start_requests(self): + r1, r2 = self.get_and_post(self.crawl_url, {'spider_name': self.spider_name}, {}) + for res in (r1, r2): + assert res.status_code == 400 + expected_result = { + u'status': u'error', + u'code': 400 + } + res_json = res.json() + self.assertDictContainsSubset(expected_result, res_json) + if res.request.method == "GET": + assert 'url' in res_json['message'] + else: + assert "request" in res_json["message"] + + def test_no_url_but_start_requests_present(self): + r1, r2 = self.get_and_post(self.crawl_url, { + 'spider_name': self.spider_name, + "start_requests": True + }, {}) + for res in (r1, r2): + assert res.status_code == 200 + assert res.json().get("status") == "ok" - def test_no_url(self): + def test_no_spider_name(self): res = requests.get( self.crawl_url, params={ - 'spider_name': self.spider_name + 'url': self.site_url, } ) assert res.status_code == 400 + res_json = res.json() expected_result = { u'status': u'error', u'code': 400 } - res_json = res.json() self.assertDictContainsSubset(expected_result, res_json) - assert 'url' in res_json['message'] + assert 'spider_name' in res_json['message'] - def test_no_spider_name(self): - res = requests.get( + def test_invalid_scrapy_request_detected_in_api(self): + res = requests.post( self.crawl_url, - params={ - 'url': self.site_url, + json={ + "request": { + 'url': self.site_url, + "not_an_argument": False + }, + "spider_name": self.spider_name, } ) assert res.status_code == 400 @@ -92,7 +140,23 @@ def test_no_spider_name(self): u'code': 400 } self.assertDictContainsSubset(expected_result, res_json) - assert 'spider_name' in res_json['message'] + assert "'not_an_argument' is not a valid argument" in res_json['message'] + + def 
test_invalid_scrapy_request_detected_by_scrapy(self): + r1, r2 = self.get_and_post( + self.crawl_url, + {"spider_name": self.spider_name}, + {'url': "no_rules"} + ) + for res in (r1, r2): + assert res.status_code == 400 + res_json = res.json() + expected_result = { + u'status': u'error', + u'code': 400 + } + self.assertDictContainsSubset(expected_result, res_json) + assert "Error while creating Scrapy Request" in res_json['message'] def test_crawl(self): res = requests.get( From bbe85f303571f10deb809e15df45c34fe74f0b5d Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Fri, 25 Mar 2016 17:30:07 +0100 Subject: [PATCH 03/17] add start_requests support --- scrapyrt/core.py | 11 ++++++---- scrapyrt/resources.py | 13 +++++------ tests/test_resource_crawl.py | 42 +++++++++++++++++------------------- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/scrapyrt/core.py b/scrapyrt/core.py index 906a5b9..4114eb1 100644 --- a/scrapyrt/core.py +++ b/scrapyrt/core.py @@ -130,7 +130,7 @@ class CrawlManager(object): Runs crawls """ - def __init__(self, spider_name, request_kwargs, max_requests=None): + def __init__(self, spider_name, request_kwargs, max_requests=None, start_requests=False): self.spider_name = spider_name self.log_dir = settings.LOG_DIR self.items = [] @@ -145,8 +145,11 @@ def __init__(self, spider_name, request_kwargs, max_requests=None): # callback will be added after instantiation of crawler object # because we need to know if spider has method available self.callback_name = request_kwargs.pop('callback', None) or 'parse' - self.request = self.create_spider_request(deepcopy(request_kwargs)) - self.start_requests = False + if request_kwargs.get("url"): + self.request = self.create_spider_request(deepcopy(request_kwargs)) + else: + self.request = None + self.start_requests = start_requests self._request_scheduled = False def crawl(self, *args, **kwargs): @@ -190,7 +193,7 @@ def spider_idle(self, spider): which is totally wrong. 
""" - if spider is self.crawler.spider and not self._request_scheduled: + if spider is self.crawler.spider and self.request and not self._request_scheduled: callback = getattr(self.crawler.spider, self.callback_name) assert callable(callback), 'Invalid callback' self.request = self.request.replace(callback=callback) diff --git a/scrapyrt/resources.py b/scrapyrt/resources.py index b69ecad..e7689e0 100644 --- a/scrapyrt/resources.py +++ b/scrapyrt/resources.py @@ -126,7 +126,6 @@ def render_GET(self, request, **kwargs): self.validate_options(spider_data, request_data) return self.prepare_crawl(request_data, spider_data, **kwargs) - def render_POST(self, request, **kwargs): """ :param request: @@ -152,12 +151,13 @@ def render_POST(self, request, **kwargs): raise Error('400', message=message) log.msg("{}".format(request_data)) - spider_data = self.get_required_argument(request_data, "request") try: - spider_data = extract_scrapy_request_args(spider_data, raise_error=True) + spider_data = extract_scrapy_request_args(request_data.get("request", {}), raise_error=True) except ValueError as e: raise Error(400, e.message) + if not request_data.get("start_requests"): + self.get_required_argument(request_data, "request") request_data = self._get_api_params(request_data) self.validate_options(spider_data, request_data) return self.prepare_crawl(request_data, spider_data, **kwargs) @@ -208,20 +208,21 @@ def prepare_crawl(self, request_data, spider_data, *args, **kwargs): Request object that will be created """ spider_name = self.get_required_argument(request_data, 'spider_name') + start_requests = request_data.get("start_requests", False) try: max_requests = request_data['max_requests'] except (KeyError, IndexError): max_requests = None dfd = self.run_crawl( - spider_name, spider_data, max_requests, *args, **kwargs) + spider_name, spider_data, max_requests, start_requests=start_requests, *args, **kwargs) dfd.addCallback( self.prepare_response, request_data=request_data, *args, **kwargs) return dfd def run_crawl(self, spider_name, spider_data, - max_requests=None, *args, **kwargs): + max_requests=None, start_requests=False, *args, **kwargs): crawl_manager_cls = load_object(settings.CRAWL_MANAGER) - manager = crawl_manager_cls(spider_name, spider_data, max_requests) + manager = crawl_manager_cls(spider_name, spider_data, max_requests, start_requests=start_requests) dfd = manager.crawl(*args, **kwargs) return dfd diff --git a/tests/test_resource_crawl.py b/tests/test_resource_crawl.py index 80714a7..83d5d69 100644 --- a/tests/test_resource_crawl.py +++ b/tests/test_resource_crawl.py @@ -101,10 +101,14 @@ def test_no_url_but_start_requests_present(self): r1, r2 = self.get_and_post(self.crawl_url, { 'spider_name': self.spider_name, "start_requests": True - }, {}) + }, {"url": self.site_url}) for res in (r1, r2): assert res.status_code == 200 - assert res.json().get("status") == "ok" + result = res.json() + assert result.get("status") == "ok" + assert result.get("stats") is not None + assert result["stats"].get("downloader/request_count") == 1 + assert len(result.get("items", [])) == 3 def test_no_spider_name(self): res = requests.get( @@ -159,24 +163,18 @@ def test_invalid_scrapy_request_detected_by_scrapy(self): assert "Error while creating Scrapy Request" in res_json['message'] def test_crawl(self): - res = requests.get( - self.crawl_url, - params={ - 'url': self.site_url, - 'spider_name': self.spider_name + r1, r2 = self.get_and_post(self.crawl_url, {"spider_name": self.spider_name}, {"url": 
self.site_url}) + for res in (r1, r2): + expected_result = { + u'status': u'ok', + u'items_dropped': [] } - ) - - expected_result = { - u'status': u'ok', - u'items_dropped': [] - } - expected_items = [{ - u'name': ['Page 1'], - }] - res_json = res.json() - self.assertDictContainsSubset(expected_result, res_json) - assert res_json['items'] - assert len(res_json['items']) == len(expected_items) - for exp_item, res_item in zip(expected_items, res_json['items']): - self.assertDictContainsSubset(exp_item, res_item) + expected_items = [{ + u'name': ['Page 1'], + }] + res_json = res.json() + self.assertDictContainsSubset(expected_result, res_json) + assert res_json['items'] + assert len(res_json['items']) == len(expected_items) + for exp_item, res_item in zip(expected_items, res_json['items']): + self.assertDictContainsSubset(exp_item, res_item) From a8c2b83af4853798590d74d5238fdf183f0215d6 Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Mon, 30 May 2016 16:31:53 +0200 Subject: [PATCH 04/17] add test for spider with start_requests --- .../testproject/testproject/items.py | 1 + .../testspider_startrequests.py | 18 ++++++++++++++++ tests/sample_data/testsite/page2.html | 4 ++-- tests/servers.py | 13 +++++++++++- tests/test_resource_crawl.py | 21 ++++++++++++------- tests/test_resource_root.py | 6 +++--- 6 files changed, 50 insertions(+), 13 deletions(-) create mode 100644 tests/sample_data/testproject/testproject/spider_templates/testspider_startrequests.py diff --git a/tests/sample_data/testproject/testproject/items.py b/tests/sample_data/testproject/testproject/items.py index abc27d9..f20fbb4 100644 --- a/tests/sample_data/testproject/testproject/items.py +++ b/tests/sample_data/testproject/testproject/items.py @@ -4,3 +4,4 @@ class TestprojectItem(scrapy.Item): name = scrapy.Field() + referer = scrapy.Field() diff --git a/tests/sample_data/testproject/testproject/spider_templates/testspider_startrequests.py b/tests/sample_data/testproject/testproject/spider_templates/testspider_startrequests.py new file mode 100644 index 0000000..260458a --- /dev/null +++ b/tests/sample_data/testproject/testproject/spider_templates/testspider_startrequests.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +import scrapy + +from ..items import TestprojectItem + + +class TestSpider(scrapy.Spider): + + name = 'test_with_sr' + initial_urls = ["{0}", "{1}"] + + def start_requests(self): + for url in self.initial_urls: + yield scrapy.Request(url, callback=self.some_callback, meta=dict(referer=url)) + + def some_callback(self, response): + name = response.xpath('//h1/text()').extract() + return TestprojectItem(name=name, referer=response.meta["referer"]) \ No newline at end of file diff --git a/tests/sample_data/testsite/page2.html b/tests/sample_data/testsite/page2.html index 1e8a2c0..209baaa 100644 --- a/tests/sample_data/testsite/page2.html +++ b/tests/sample_data/testsite/page2.html @@ -1,12 +1,12 @@ - Page 1 + Page 2 -

<h1>Page 1</h1>
+<h1>Page 2</h1>
diff --git a/tests/servers.py b/tests/servers.py index 03d6f0f..cd4a458 100644 --- a/tests/servers.py +++ b/tests/servers.py @@ -103,7 +103,7 @@ def _non_block_read(output): class ScrapyrtTestServer(BaseTestServer): - def __init__(self, *args, **kwargs): + def __init__(self, site=None, *args, **kwargs): super(ScrapyrtTestServer, self).__init__(*args, **kwargs) self.arguments = [ sys.executable, '-m', 'scrapyrt.cmdline', '-p', str(self.port) @@ -111,9 +111,20 @@ def __init__(self, *args, **kwargs): self.stderr = PIPE self.tmp_dir = tempfile.mkdtemp() self.cwd = os.path.join(self.tmp_dir, 'testproject') + source = os.path.join(SAMPLE_DATA, 'testproject') shutil.copytree( source, self.cwd, ignore=shutil.ignore_patterns('*.pyc')) + # Pass site url to spider doing start requests + spider_name = "testspider_startrequests.py" + spider_filename = os.path.join(self.cwd, "testproject", "spider_templates", spider_name) + spider_target_place = os.path.join(self.cwd, "testproject", "spiders", spider_name) + if not site: + return + with open(spider_filename) as spider_file: + spider_string = spider_file.read().format(site.url("page1.html"), site.url("page2.html")) + with open(spider_target_place, "wb") as file_target: + file_target.write(spider_string) def stop(self): super(ScrapyrtTestServer, self).stop() diff --git a/tests/test_resource_crawl.py b/tests/test_resource_crawl.py index 83d5d69..44b9397 100644 --- a/tests/test_resource_crawl.py +++ b/tests/test_resource_crawl.py @@ -40,13 +40,13 @@ def test_empty_argument(self): class TestCrawlResourceIntegration(unittest.TestCase): def setUp(self): - self.server = ScrapyrtTestServer() - self.server.start() - self.crawl_url = self.server.url('crawl.json') self.site = MockServer() self.site.start() self.site_url = self.site.url('page1.html') self.spider_name = 'test' + self.server = ScrapyrtTestServer(site=self.site) + self.server.start() + self.crawl_url = self.server.url('crawl.json') def tearDown(self): if not self._passed: @@ -99,16 +99,23 @@ def test_no_url_no_start_requests(self): def test_no_url_but_start_requests_present(self): r1, r2 = self.get_and_post(self.crawl_url, { - 'spider_name': self.spider_name, + 'spider_name': "test_with_sr", "start_requests": True - }, {"url": self.site_url}) + }, {}) for res in (r1, r2): assert res.status_code == 200 result = res.json() assert result.get("status") == "ok" assert result.get("stats") is not None - assert result["stats"].get("downloader/request_count") == 1 - assert len(result.get("items", [])) == 3 + assert len(result.get("items", [])) == 2 + items = result["items"] + assert items[0]["name"][0] == u"Page 1" + assert items[1]["name"][0] == u"Page 2" + assert "page1" in items[0]["referer"] + assert "page2" in items[1]["referer"] + spider_errors = result.get("errors", []) + assert len(spider_errors) == 0 + assert result["stats"].get("downloader/request_count") == 2 def test_no_spider_name(self): res = requests.get( diff --git a/tests/test_resource_root.py b/tests/test_resource_root.py index d56c4d0..9853713 100644 --- a/tests/test_resource_root.py +++ b/tests/test_resource_root.py @@ -8,11 +8,11 @@ class TestRootResourceIntegration(unittest.TestCase): def setUp(self): - self.server = ScrapyrtTestServer() - self.server.start() - self.root_url = self.server.url() self.site = MockServer() self.site.start() + self.server = ScrapyrtTestServer(site=self.site) + self.server.start() + self.root_url = self.server.url() def tearDown(self): if not self._passed: From a1fa6fda8761fa0b1012d728f32c5d744a2935eb Mon Sep 17 
00:00:00 2001 From: Pawel Miech Date: Thu, 22 Dec 2016 12:24:49 +0100 Subject: [PATCH 05/17] [resources] refactor, rename important variable names request_data will be changed to api_params; spider_data will be changed to scrapy_request_args --- scrapyrt/resources.py | 59 +++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/scrapyrt/resources.py b/scrapyrt/resources.py index e7689e0..80e4bbb 100644 --- a/scrapyrt/resources.py +++ b/scrapyrt/resources.py @@ -117,14 +117,15 @@ def render_GET(self, request, **kwargs): At the moment kwargs for scrapy request are not supported in GET. They are supported in POST handler. """ - request_data = dict( + api_params = dict( (name.decode('utf-8'), value[0].decode('utf-8')) for name, value in request.args.items() ) - spider_data = extract_scrapy_request_args(request_data, raise_error=False) - request_data = self._get_api_params(request_data) - self.validate_options(spider_data, request_data) - return self.prepare_crawl(request_data, spider_data, **kwargs) + scrapy_request_args = extract_scrapy_request_args(api_params, + raise_error=False) + api_params = self._get_api_params(api_params) + self.validate_options(scrapy_request_args, api_params) + return self.prepare_crawl(api_params, scrapy_request_args, **kwargs) def render_POST(self, request, **kwargs): """ @@ -144,23 +145,25 @@ def render_POST(self, request, **kwargs): """ request_body = request.content.getvalue() try: - request_data = demjson.decode(request_body) + api_params = demjson.decode(request_body) except ValueError as e: message = "Invalid JSON in POST body. {}" message.format(e.pretty_description()) raise Error('400', message=message) - log.msg("{}".format(request_data)) + log.msg("{}".format(api_params)) try: - spider_data = extract_scrapy_request_args(request_data.get("request", {}), raise_error=True) + scrapy_request_args = extract_scrapy_request_args( + api_params.get("request", {}), raise_error=True + ) except ValueError as e: raise Error(400, e.message) - if not request_data.get("start_requests"): - self.get_required_argument(request_data, "request") - request_data = self._get_api_params(request_data) - self.validate_options(spider_data, request_data) - return self.prepare_crawl(request_data, spider_data, **kwargs) + if not api_params.get("start_requests"): + self.get_required_argument(api_params, "request") + api_params = self._get_api_params(api_params) + self.validate_options(scrapy_request_args, api_params) + return self.prepare_crawl(api_params, scrapy_request_args, **kwargs) def _get_api_params(self, request_data): api_params = {} @@ -169,19 +172,20 @@ def _get_api_params(self, request_data): api_params[k] = v return api_params - def validate_options(self, spider_data, api_params): - url = spider_data.get("url") + def validate_options(self, scrapy_request_args, api_params): + url = scrapy_request_args.get("url") start_requests = api_params.get("start_requests") if not url and not start_requests: - raise Error(400, "'url' is required if start_requests are disabled") + raise Error(400, + "'url' is required if start_requests are disabled") - def get_required_argument(self, request_data, name, error_msg=None): + def get_required_argument(self, api_params, name, error_msg=None): """Get required API key from dict-like object. - :param dict request_data: + :param dict api_params: dictionary with names and values of parameters supplied to API. 
:param str name: - required key that must be found in request_data + required key that must be found in api_params :return: value of required param :raises Error: Bad Request response @@ -189,17 +193,17 @@ def get_required_argument(self, request_data, name, error_msg=None): if error_msg is None: error_msg = 'Missing required parameter: {}'.format(repr(name)) try: - value = request_data[name] + value = api_params[name] except KeyError: raise Error('400', message=error_msg) if not value: raise Error('400', message=error_msg) return value - def prepare_crawl(self, request_data, spider_data, *args, **kwargs): + def prepare_crawl(self, api_params, spider_data, *args, **kwargs): """Schedule given spider with CrawlManager. - :param dict request_data: + :param dict api_params: arguments needed to find spider and set proper api parameters for crawl (max_requests for example) @@ -207,16 +211,17 @@ def prepare_crawl(self, request_data, spider_data, *args, **kwargs): should contain positional and keyword arguments for Scrapy Request object that will be created """ - spider_name = self.get_required_argument(request_data, 'spider_name') - start_requests = request_data.get("start_requests", False) + spider_name = self.get_required_argument(api_params, 'spider_name') + start_requests = api_params.get("start_requests", False) try: - max_requests = request_data['max_requests'] + max_requests = api_params['max_requests'] except (KeyError, IndexError): max_requests = None dfd = self.run_crawl( - spider_name, spider_data, max_requests, start_requests=start_requests, *args, **kwargs) + spider_name, spider_data, max_requests, + start_requests=start_requests, *args, **kwargs) dfd.addCallback( - self.prepare_response, request_data=request_data, *args, **kwargs) + self.prepare_response, request_data=api_params, *args, **kwargs) return dfd def run_crawl(self, spider_name, spider_data, From 50f496e471950674755bd29e0ba4bc6ec1d49a40 Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Thu, 22 Dec 2016 12:28:06 +0100 Subject: [PATCH 06/17] [style] sort imports in resources --- scrapyrt/core.py | 2 -- scrapyrt/resources.py | 8 +++----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/scrapyrt/core.py b/scrapyrt/core.py index 4114eb1..ee60a7c 100644 --- a/scrapyrt/core.py +++ b/scrapyrt/core.py @@ -159,7 +159,6 @@ def crawl(self, *args, **kwargs): dfd = self.crawler_process.crawl(self.spider_name, *args, **kwargs) except KeyError as e: # Spider not found. - # TODO is it the only possible exception here? raise Error('404', message=e.message) dfd.addCallback(self.return_items) return dfd @@ -270,7 +269,6 @@ def create_spider_request(self, kwargs): message = "Error while creating Scrapy Request, {}".format(e.message) raise Error('400', message=message) - # why is it here? 
req.dont_filter = True msg = u"Created request for spider {} with url {} and kwargs {}" msg = msg.format(self.spider_name, url, repr(kwargs)) diff --git a/scrapyrt/resources.py b/scrapyrt/resources.py index 80e4bbb..48a596a 100644 --- a/scrapyrt/resources.py +++ b/scrapyrt/resources.py @@ -1,16 +1,15 @@ # -*- coding: utf-8 -*- import demjson - from scrapy.utils.misc import load_object from scrapy.utils.serialize import ScrapyJSONEncoder from twisted.internet.defer import Deferred from twisted.python.failure import Failure -from twisted.web import server, resource -from twisted.web.error import UnsupportedMethod, Error +from twisted.web import resource, server +from twisted.web.error import Error, UnsupportedMethod -from scrapyrt.utils import extract_scrapy_request_args from . import log from .conf import settings +from .utils import extract_scrapy_request_args class ServiceResource(resource.Resource, object): @@ -244,4 +243,3 @@ def prepare_response(self, result, *args, **kwargs): if errors: response["errors"] = errors return response - From 3b901038b67c1f484b139b7ab629b93e48ee54a1 Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Thu, 22 Dec 2016 12:43:39 +0100 Subject: [PATCH 07/17] [requirements-dev] update test packages to versions that are known to work --- requirements-dev.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index d8f7f0a..e9dea3c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,8 +1,8 @@ -r requirements.txt fabric -requests -mock -pytest -pytest-cov -port-for +requests==2.9.1 +mock==1.3.0 +pytest==2.9.1 +pytest-cov==2.2.1 +port-for==0.3.1 From 0da27f2749b1d16ad712f0f41080d0d0f1d19ba6 Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Thu, 22 Dec 2016 12:59:24 +0100 Subject: [PATCH 08/17] [utils] add tests for extract_scrapy_request_args and also add docstring for that --- scrapyrt/utils.py | 7 +++++++ tests/test_utils.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 tests/test_utils.py diff --git a/scrapyrt/utils.py b/scrapyrt/utils.py index 5c94adc..2bafce0 100644 --- a/scrapyrt/utils.py +++ b/scrapyrt/utils.py @@ -3,6 +3,13 @@ def extract_scrapy_request_args(dictionary, raise_error=False): + """ + :param dictionary: Dictionary with parameters passed to API + :param raise_error: raise ValueError if key is not valid arg for + scrapy.httpRequest + :return: dictionary of valid scrapy.http.Request positional and keyword + arguments. 
+ """ result = dictionary.copy() args = inspect.getargspec(Request.__init__).args for key in dictionary.keys(): diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..893dc98 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,34 @@ +import re + +import pytest + +from scrapyrt.utils import extract_scrapy_request_args + + +class TestUtils(object): + def test_get_scrapy_request_args(self): + args = { + "url": "http://foo.com", + "callback": "parse", + "noise": True + } + + result = extract_scrapy_request_args(args) + + assert result["url"] == "http://foo.com" + assert result["callback"] == "parse" + assert "noise" not in result + + def test_get_scrapy_request_args_error(self): + args = { + "url": "http://foo.com", + "callback": "parse", + "noise": True + } + + with pytest.raises(ValueError) as e: + extract_scrapy_request_args(args, raise_error=True) + + expected_msg =u"'noise' is not a valid argument for scrapy.Request" + assert re.search(expected_msg, e.value.message) + From abb4a2a85a806ccdb93edaa2a7051d2c0e73522a Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Thu, 22 Dec 2016 13:02:56 +0100 Subject: [PATCH 09/17] [crawler_manager_test] unskip limit runtime test --- tests/test_crawl_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_crawl_manager.py b/tests/test_crawl_manager.py index d8b6554..f57e2ad 100644 --- a/tests/test_crawl_manager.py +++ b/tests/test_crawl_manager.py @@ -161,7 +161,7 @@ class TestLimitRuntime(TestCrawlManager): def setUp(self): super(TestLimitRuntime, self).setUp() self.crawl_manager.timeout_limit = 1 - self.crawler.stats.get_value.return_value = datetime.datetime.now() + self.crawler.stats.get_value.return_value = datetime.datetime.utcnow() def _test_limit_runtime(self): self.crawl_manager.limit_runtime(self.spider) @@ -170,7 +170,6 @@ def _test_limit_runtime(self): self.crawl_manager.limit_runtime(self.spider) self.assertTrue(self.crawler.engine.close_spider.called) - @pytest.skip("why is it failing?") def test_limit_runtime(self): self._test_limit_runtime() From 5454bf362743800c9778146bee54afac73f13d72 Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Thu, 22 Dec 2016 13:06:15 +0100 Subject: [PATCH 10/17] [resources] more renaming of spider_Data to scrapy_request_args --- scrapyrt/resources.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scrapyrt/resources.py b/scrapyrt/resources.py index 48a596a..0b43fd7 100644 --- a/scrapyrt/resources.py +++ b/scrapyrt/resources.py @@ -199,14 +199,14 @@ def get_required_argument(self, api_params, name, error_msg=None): raise Error('400', message=error_msg) return value - def prepare_crawl(self, api_params, spider_data, *args, **kwargs): + def prepare_crawl(self, api_params, scrapy_request_args, *args, **kwargs): """Schedule given spider with CrawlManager. 
:param dict api_params: arguments needed to find spider and set proper api parameters for crawl (max_requests for example) - :param dict spider_data: + :param dict scrapy_request_args: should contain positional and keyword arguments for Scrapy Request object that will be created """ @@ -217,16 +217,16 @@ def prepare_crawl(self, api_params, spider_data, *args, **kwargs): except (KeyError, IndexError): max_requests = None dfd = self.run_crawl( - spider_name, spider_data, max_requests, + spider_name, scrapy_request_args, max_requests, start_requests=start_requests, *args, **kwargs) dfd.addCallback( self.prepare_response, request_data=api_params, *args, **kwargs) return dfd - def run_crawl(self, spider_name, spider_data, + def run_crawl(self, spider_name, scrapy_request_args, max_requests=None, start_requests=False, *args, **kwargs): crawl_manager_cls = load_object(settings.CRAWL_MANAGER) - manager = crawl_manager_cls(spider_name, spider_data, max_requests, start_requests=start_requests) + manager = crawl_manager_cls(spider_name, scrapy_request_args, max_requests, start_requests=start_requests) dfd = manager.crawl(*args, **kwargs) return dfd From 9c2f456b8e2c3d8ae56a98318a736078122bad6c Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Thu, 22 Dec 2016 14:18:52 +0100 Subject: [PATCH 11/17] [tests] parametrize resource tests --- tests/test_resource_crawl.py | 255 ++++++++++++++++++----------------- 1 file changed, 128 insertions(+), 127 deletions(-) diff --git a/tests/test_resource_crawl.py b/tests/test_resource_crawl.py index 44b9397..822c121 100644 --- a/tests/test_resource_crawl.py +++ b/tests/test_resource_crawl.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import pytest from twisted.trial import unittest from twisted.web.error import Error import requests @@ -8,6 +9,22 @@ from .servers import ScrapyrtTestServer, MockServer +@pytest.fixture() +def server(request): + target_site = MockServer() + target_site.start() + server = ScrapyrtTestServer(site=target_site) + + def close(): + server.stop() + target_site.stop() + + request.addfinalizer(close) + server.target_site = target_site + server.start() + return server + + class TestCrawlResource(unittest.TestCase): def test_is_leaf(self): @@ -37,151 +54,135 @@ def test_empty_argument(self): self.assertEqual(exception.status, '400') -class TestCrawlResourceIntegration(unittest.TestCase): +def perform_get(url, api_params, spider_data): + api_params.update(spider_data) + return requests.get(url, params=api_params) - def setUp(self): - self.site = MockServer() - self.site.start() - self.site_url = self.site.url('page1.html') - self.spider_name = 'test' - self.server = ScrapyrtTestServer(site=self.site) - self.server.start() - self.crawl_url = self.server.url('crawl.json') - - def tearDown(self): - if not self._passed: - print self.server._non_block_read(self.server.proc.stderr) - self.server.stop() - self.site.stop() - - def get_and_post(self, url, api_params, spider_data): - get_params = api_params.copy() - get_params.update(spider_data) - res_get = requests.get( - url, params=get_params - ) - post_data = { - "request": spider_data + +def perform_post(url, api_params, spider_data): + post_data = {"request": spider_data} + post_data.update(api_params) + return requests.post(url, json=post_data) + + +class TestCrawlResourceIntegration(object): + @pytest.mark.parametrize("method", [ + perform_get, perform_post + ]) + def test_no_parameters(self, server, method): + res = method(server.url('crawl.json'), {}, {}) + assert res.status_code == 400 + res_json = 
res.json() + expected_result = {u'status': u'error', u'code': 400} + for key, value in expected_result.items(): + assert res_json.get(key) == value + if res.request.method == "GET": + assert 'url' in res_json['message'] + else: + assert "request" in res_json["message"] + + @pytest.mark.parametrize("method", [ + perform_get, perform_post + ]) + def test_no_url_no_start_requests(self, server, method): + res = method(server.url('crawl.json'), {'spider_name': 'test'}, + {}) + assert res.status_code == 400 + expected_result = { + u'status': u'error', + u'code': 400 } - post_data.update(api_params) - res_post = requests.post( - url, - json=post_data - ) - return res_get, res_post - - def test_no_parameters(self): - r1, r2 = self.get_and_post(self.crawl_url, {}, {}) - for res in (r1, r2): - assert res.status_code == 400 - res_json = res.json() - expected_result = {u'status': u'error', u'code': 400} - self.assertDictContainsSubset(expected_result, res_json) - if res.request.method == "GET": - assert 'url' in res_json['message'] - else: - assert "request" in res_json["message"] - - def test_no_url_no_start_requests(self): - r1, r2 = self.get_and_post(self.crawl_url, {'spider_name': self.spider_name}, {}) - for res in (r1, r2): - assert res.status_code == 400 - expected_result = { - u'status': u'error', - u'code': 400 - } - res_json = res.json() - self.assertDictContainsSubset(expected_result, res_json) - if res.request.method == "GET": - assert 'url' in res_json['message'] - else: - assert "request" in res_json["message"] - - def test_no_url_but_start_requests_present(self): - r1, r2 = self.get_and_post(self.crawl_url, { + res_json = res.json() + for key, value in expected_result.items(): + assert res_json[key] == value + if res.request.method == "GET": + assert 'url' in res_json['message'] + else: + assert "request" in res_json["message"] + + @pytest.mark.parametrize("method", [ + perform_get, perform_post + ]) + def test_no_url_but_start_requests_present(self, server, method): + res = method(server.url("crawl.json"), { 'spider_name': "test_with_sr", "start_requests": True }, {}) - for res in (r1, r2): - assert res.status_code == 200 - result = res.json() - assert result.get("status") == "ok" - assert result.get("stats") is not None - assert len(result.get("items", [])) == 2 - items = result["items"] - assert items[0]["name"][0] == u"Page 1" - assert items[1]["name"][0] == u"Page 2" - assert "page1" in items[0]["referer"] - assert "page2" in items[1]["referer"] - spider_errors = result.get("errors", []) - assert len(spider_errors) == 0 - assert result["stats"].get("downloader/request_count") == 2 - - def test_no_spider_name(self): - res = requests.get( - self.crawl_url, - params={ - 'url': self.site_url, - } - ) + assert res.status_code == 200 + result = res.json() + assert result.get("status") == "ok" + assert result.get("stats") is not None + assert len(result.get("items", [])) == 2 + items = result["items"] + assert items[0]["name"][0] == u"Page 1" + assert items[1]["name"][0] == u"Page 2" + assert "page1" in items[0]["referer"] + assert "page2" in items[1]["referer"] + spider_errors = result.get("errors", []) + assert len(spider_errors) == 0 + assert result["stats"].get("downloader/request_count") == 2 + + @pytest.mark.parametrize("method", [ + perform_get, perform_post + ]) + def test_no_spider_name(self, server, method): + res = method(server.url("crawl.json"), + {}, + {"url": server.target_site.url("page1.html")}) assert res.status_code == 400 res_json = res.json() expected_result = { 
u'status': u'error', u'code': 400 } - self.assertDictContainsSubset(expected_result, res_json) + for key, value in expected_result.items(): + assert res_json[key] == value assert 'spider_name' in res_json['message'] - def test_invalid_scrapy_request_detected_in_api(self): - res = requests.post( - self.crawl_url, - json={ - "request": { - 'url': self.site_url, - "not_an_argument": False - }, - "spider_name": self.spider_name, - } - ) + def test_invalid_scrapy_request_detected_in_api(self, server): + res = perform_post(server.url("crawl.json"), + {"spider_name": "test"}, + {'url': server.target_site.url("page1.html"), + "not_an_argument": False}) assert res.status_code == 400 res_json = res.json() expected_result = { u'status': u'error', u'code': 400 } - self.assertDictContainsSubset(expected_result, res_json) - assert "'not_an_argument' is not a valid argument" in res_json['message'] - - def test_invalid_scrapy_request_detected_by_scrapy(self): - r1, r2 = self.get_and_post( - self.crawl_url, - {"spider_name": self.spider_name}, + for k, v in expected_result.items(): + assert res_json[k] == v + assert "'not_an_argument' is not a valid arg" in res_json['message'] + + @pytest.mark.parametrize("method", [ + perform_get, perform_post + ]) + def test_invalid_scrapy_request_detected_by_scrapy(self, server, method): + res = method( + server.url("crawl.json"), + {"spider_name": "test"}, {'url': "no_rules"} ) - for res in (r1, r2): - assert res.status_code == 400 - res_json = res.json() - expected_result = { - u'status': u'error', - u'code': 400 - } - self.assertDictContainsSubset(expected_result, res_json) - assert "Error while creating Scrapy Request" in res_json['message'] - - def test_crawl(self): - r1, r2 = self.get_and_post(self.crawl_url, {"spider_name": self.spider_name}, {"url": self.site_url}) - for res in (r1, r2): - expected_result = { - u'status': u'ok', - u'items_dropped': [] - } - expected_items = [{ - u'name': ['Page 1'], - }] - res_json = res.json() - self.assertDictContainsSubset(expected_result, res_json) - assert res_json['items'] - assert len(res_json['items']) == len(expected_items) - for exp_item, res_item in zip(expected_items, res_json['items']): - self.assertDictContainsSubset(exp_item, res_item) + assert res.status_code == 400 + res_json = res.json() + assert res_json["status"] == "error" + assert res_json["code"] == 400 + assert "Error while creating Scrapy Request" in res_json['message'] + + @pytest.mark.parametrize("method", [ + perform_get, perform_post + ]) + def test_crawl(self, server, method): + url = server.url("crawl.json") + res = method(url, + {"spider_name": "test"}, + {"url": server.target_site.url("page1.html")}) + expected_items = [{ + u'name': ['Page 1'], + }] + res_json = res.json() + assert res_json["status"] == "ok" + assert res_json["items_dropped"] == [] + assert res_json['items'] + assert len(res_json['items']) == len(expected_items) + assert res_json["items"] == expected_items From a6ad46ac5c84bad10f7de9ca1c61f663751c15e2 Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Wed, 4 Jan 2017 17:34:36 +0100 Subject: [PATCH 12/17] [resources] dont validate/register api parameters --- scrapyrt/resources.py | 14 -------------- scrapyrt/utils.py | 2 +- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/scrapyrt/resources.py b/scrapyrt/resources.py index 0b43fd7..b1bd061 100644 --- a/scrapyrt/resources.py +++ b/scrapyrt/resources.py @@ -104,11 +104,6 @@ class CrawlResource(ServiceResource): isLeaf = True allowedMethods = ['GET', 'POST'] - 
api_parameters = { - "spider_name", - "max_requests", - "start_requests" - } def render_GET(self, request, **kwargs): """Request querysting must contain following keys: url, spider_name. @@ -122,7 +117,6 @@ def render_GET(self, request, **kwargs): ) scrapy_request_args = extract_scrapy_request_args(api_params, raise_error=False) - api_params = self._get_api_params(api_params) self.validate_options(scrapy_request_args, api_params) return self.prepare_crawl(api_params, scrapy_request_args, **kwargs) @@ -160,17 +154,9 @@ def render_POST(self, request, **kwargs): if not api_params.get("start_requests"): self.get_required_argument(api_params, "request") - api_params = self._get_api_params(api_params) self.validate_options(scrapy_request_args, api_params) return self.prepare_crawl(api_params, scrapy_request_args, **kwargs) - def _get_api_params(self, request_data): - api_params = {} - for k, v in request_data.items(): - if any(k == p for p in self.api_parameters): - api_params[k] = v - return api_params - def validate_options(self, scrapy_request_args, api_params): url = scrapy_request_args.get("url") start_requests = api_params.get("start_requests") diff --git a/scrapyrt/utils.py b/scrapyrt/utils.py index 2bafce0..2e4637a 100644 --- a/scrapyrt/utils.py +++ b/scrapyrt/utils.py @@ -6,7 +6,7 @@ def extract_scrapy_request_args(dictionary, raise_error=False): """ :param dictionary: Dictionary with parameters passed to API :param raise_error: raise ValueError if key is not valid arg for - scrapy.httpRequest + scrapy.http.Request :return: dictionary of valid scrapy.http.Request positional and keyword arguments. """ From b5ae65bac293f0f4fd3ede79ded66199a7b2da7a Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Wed, 4 Jan 2017 18:05:11 +0100 Subject: [PATCH 13/17] [docs] document start_requests parameter --- docs/source/api.rst | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 307baa6..04ea062 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -57,11 +57,19 @@ spider_name url - type: string - - required + - required if start_requests not enabled Absolute URL to send request to. URL should be urlencoded so that querystring from url will not interfere with api parameters. + By default API will crawl this url and won't execute any other requests. + Most importantly it will not execute ``start_requests`` and spider will + not visit urls defined in ``start_urls`` spider attribute. There will be + only one single request scheduled in API - request for resource identified + by url argument. + + If you want to execute request pass start_requests argument. + callback - type: string - optional @@ -73,13 +81,26 @@ max_requests - type: integer - optional - Maximal amount of requests spider can generate. E.g. if it is set to ``1`` + Maximum amount of requests spider can generate. E.g. if it is set to ``1`` spider will only schedule one single request, other requests generated by spider (for example in callback, following links in first response) will be ignored. If your spider generates many requests in callback and you don't want to wait forever for it to finish you should probably pass it. +start_requests + - type: boolean + - optional + + Whether spider should execute ``Scrapy.Spider.start_requests`` method. + ``start_requests`` are executed by default when you run Scrapy Spider + normally without ScrapyRT, but this method is NOT executed in API by + default. 
By default we assume that spider is expected to crawl ONLY url + provided in parameters without making any requests to ``start_urls`` + defined in ``Spider`` class. start_requests argument overrides this + behavior. If this argument is present API will execute start_requests + Spider method. + If required parameters are missing api will return 400 Bad Request with hopefully helpful error message. From 0c636216e00acb15e9a18946a0fc267bea912214 Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Fri, 20 Jan 2017 17:29:14 +0100 Subject: [PATCH 14/17] [tests/test_resource_crawl] dont rely on item order in tests --- tests/test_resource_crawl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_resource_crawl.py b/tests/test_resource_crawl.py index 822c121..01df344 100644 --- a/tests/test_resource_crawl.py +++ b/tests/test_resource_crawl.py @@ -114,10 +114,14 @@ def test_no_url_but_start_requests_present(self, server, method): assert result.get("stats") is not None assert len(result.get("items", [])) == 2 items = result["items"] - assert items[0]["name"][0] == u"Page 1" - assert items[1]["name"][0] == u"Page 2" - assert "page1" in items[0]["referer"] - assert "page2" in items[1]["referer"] + assert len(items) == 2 + for item in items: + name = item["name"][0] + if name == "Page 1": + assert "page1" in item["referer"] + elif name == "Page 2": + assert "page2" in item["referer"] + spider_errors = result.get("errors", []) assert len(spider_errors) == 0 assert result["stats"].get("downloader/request_count") == 2 From 79e3e477dd2b7d5a97fa8aac10ebe325a3229731 Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Fri, 20 Jan 2017 18:04:08 +0100 Subject: [PATCH 15/17] [tests] add test for scenario when start_requests and url present there was no test for this, should be one now --- .../testproject/testproject/items.py | 1 + .../testspider_startrequests.py | 7 ++++- tests/sample_data/testsite/page3.html | 12 ++++++++ tests/test_resource_crawl.py | 28 +++++++++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 tests/sample_data/testsite/page3.html diff --git a/tests/sample_data/testproject/testproject/items.py b/tests/sample_data/testproject/testproject/items.py index f20fbb4..fbb75da 100644 --- a/tests/sample_data/testproject/testproject/items.py +++ b/tests/sample_data/testproject/testproject/items.py @@ -5,3 +5,4 @@ class TestprojectItem(scrapy.Item): name = scrapy.Field() referer = scrapy.Field() + response = scrapy.Field() diff --git a/tests/sample_data/testproject/testproject/spider_templates/testspider_startrequests.py b/tests/sample_data/testproject/testproject/spider_templates/testspider_startrequests.py index 260458a..bef84fb 100644 --- a/tests/sample_data/testproject/testproject/spider_templates/testspider_startrequests.py +++ b/tests/sample_data/testproject/testproject/spider_templates/testspider_startrequests.py @@ -15,4 +15,9 @@ def start_requests(self): def some_callback(self, response): name = response.xpath('//h1/text()').extract() - return TestprojectItem(name=name, referer=response.meta["referer"]) \ No newline at end of file + return TestprojectItem(name=name, referer=response.meta["referer"]) + + def parse(self, response): + name = response.xpath("//h1/text()").extract() + return TestprojectItem(name=name, referer=response.meta.get("referer"), + response=response.url) \ No newline at end of file diff --git a/tests/sample_data/testsite/page3.html b/tests/sample_data/testsite/page3.html new file mode 100644 index 
0000000..6ae7b09
--- /dev/null
+++ b/tests/sample_data/testsite/page3.html
@@ -0,0 +1,12 @@
+<html>
+<head>
+    <title>Page 3</title>
+</head>
+<body>
+<h1>Page 3</h1>
+ + + \ No newline at end of file diff --git a/tests/test_resource_crawl.py b/tests/test_resource_crawl.py index 01df344..3885ba0 100644 --- a/tests/test_resource_crawl.py +++ b/tests/test_resource_crawl.py @@ -126,6 +126,34 @@ def test_no_url_but_start_requests_present(self, server, method): assert len(spider_errors) == 0 assert result["stats"].get("downloader/request_count") == 2 + @pytest.mark.parametrize("method", [ + perform_get, perform_post + ]) + def test_url_and_start_requests_present(self, server, method): + spider_data = { + "url": server.target_site.url("page3.html") + } + api_params = { + "spider_name": "test_with_sr", + "start_requests": True, + } + res = method(server.url("crawl.json"), api_params, + spider_data) + assert res.status_code == 200 + output = res.json() + assert len(output.get("errors", [])) == 0 + items = output.get("items", []) + assert len(items) == 3 + + for item in items: + name = item["name"][0] + if name == "Page 1": + assert "page1" in item["referer"] + elif name == "Page 2": + assert "page2" in item["referer"] + elif name == "Page 3": + assert item.get("referer") is None + @pytest.mark.parametrize("method", [ perform_get, perform_post ]) From 59d0a0153ac041c7184da39ad139e4c0b88969eb Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Fri, 20 Jan 2017 18:25:54 +0100 Subject: [PATCH 16/17] [resources] readability in start_requests POST handler + test for missing "request" argument to POST endpoint --- scrapyrt/resources.py | 10 +++++++--- tests/test_resource_crawl.py | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/scrapyrt/resources.py b/scrapyrt/resources.py index b1bd061..60483fb 100644 --- a/scrapyrt/resources.py +++ b/scrapyrt/resources.py @@ -145,15 +145,19 @@ def render_POST(self, request, **kwargs): raise Error('400', message=message) log.msg("{}".format(api_params)) + if api_params.get("start_requests"): + # start requests passed so 'request' argument is optional + _request = api_params.get("request", {}) + else: + # no start_requests, 'request' is required + _request = self.get_required_argument(api_params, "request") try: scrapy_request_args = extract_scrapy_request_args( - api_params.get("request", {}), raise_error=True + _request, raise_error=True ) except ValueError as e: raise Error(400, e.message) - if not api_params.get("start_requests"): - self.get_required_argument(api_params, "request") self.validate_options(scrapy_request_args, api_params) return self.prepare_crawl(api_params, scrapy_request_args, **kwargs) diff --git a/tests/test_resource_crawl.py b/tests/test_resource_crawl.py index 3885ba0..fae90e1 100644 --- a/tests/test_resource_crawl.py +++ b/tests/test_resource_crawl.py @@ -126,6 +126,25 @@ def test_no_url_but_start_requests_present(self, server, method): assert len(spider_errors) == 0 assert result["stats"].get("downloader/request_count") == 2 + def test_no_request_but_start_requests_present(self, server): + """Test for POST handler checking if everything works fine + if there is no 'request' argument, but 'start_requests' are + present. Not checked above because of the way default test fixtures + are written. 
+ """ + post_data = { + "no_request": {}, + "start_requests": True, + "spider_name": "test_with_sr" + } + post_data.update(post_data) + res = requests.post(server.url("crawl.json"), + json=post_data) + assert res.status_code == 200 + data = res.json() + assert len(data["items"]) == 2 + assert data.get("errors") is None + @pytest.mark.parametrize("method", [ perform_get, perform_post ]) From ebec1aee764c7ef63f17ddb50e86f91c003037ec Mon Sep 17 00:00:00 2001 From: Pawel Miech Date: Fri, 20 Jan 2017 18:36:45 +0100 Subject: [PATCH 17/17] [tests] add another test for POST handler test if missing request parameter in POST will raise 400 --- tests/test_resource_crawl.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_resource_crawl.py b/tests/test_resource_crawl.py index fae90e1..956d93b 100644 --- a/tests/test_resource_crawl.py +++ b/tests/test_resource_crawl.py @@ -145,6 +145,24 @@ def test_no_request_but_start_requests_present(self, server): assert len(data["items"]) == 2 assert data.get("errors") is None + def test_no_request_in_POST_handler(self, server): + """Test for POST handler checking if everything works fine + if there is no 'request' argument at all. + """ + post_data = { + "no_request": {}, + "spider_name": "test_with_sr" + } + post_data.update(post_data) + res = requests.post(server.url("crawl.json"), + json=post_data) + assert res.status_code == 400 + data = res.json() + msg = u"Missing required parameter: 'request'" + assert data["message"] == msg + assert data["status"] == "error" + assert data.get("items") is None + @pytest.mark.parametrize("method", [ perform_get, perform_post ])
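The series above makes the Scrapy Request 'url' optional whenever the 'start_requests' API parameter is enabled, for both the GET and the POST endpoint. Below is a minimal client-side sketch of the two call styles exercised by the integration tests; the port 9080 (assumed ScrapyRT default) and the 'test_with_sr' spider are assumptions borrowed from the test fixtures rather than values fixed by these patches:

import requests

SCRAPYRT = "http://localhost:9080/crawl.json"  # assumed local ScrapyRT instance

# GET: with start_requests enabled, the 'url' parameter is no longer required.
res_get = requests.get(
    SCRAPYRT,
    params={"spider_name": "test_with_sr", "start_requests": True},
)

# POST: the 'request' object becomes optional for the same reason.
res_post = requests.post(
    SCRAPYRT,
    json={"spider_name": "test_with_sr", "start_requests": True, "request": {}},
)

for res in (res_get, res_post):
    data = res.json()
    # Items here come from the spider's own start_requests(), not from a
    # single request scheduled by the API.
    print(res.status_code, data.get("status"), len(data.get("items", [])))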