313 changes: 271 additions & 42 deletions tests/test_integration.py
@@ -1,11 +1,22 @@
# -*- coding: utf-8 -*-
import pytest
import scrapy
from pkg_resources import parse_version
from pytest_twisted import inlineCallbacks
from twisted.web.resource import Resource
from w3lib.url import canonicalize_url
from w3lib.http import basic_auth_header

from scrapy_splash import SplashRequest
from .utils import crawl_items, requires_splash, HtmlResource
from .utils import crawl_items, requires_splash
from .resources import (
HelloWorld,
Http400Resource,
ManyCookies,
HelloWorldProtected,
HelloWorldDisallowByRobots,
HelloWorldDisallowAuth,
)


DEFAULT_SCRIPT = """
function main(splash)
@@ -16,7 +27,10 @@
http_method=splash.args.http_method,
body=splash.args.body,
}
local wait = tonumber(splash.args.wait or 0.5)
local wait = 0.01
if splash.args.wait ~= nil then
wait = splash.args.wait
end
assert(splash:wait(wait))
local entries = splash:history()
@@ -34,49 +48,60 @@
"""


class HelloWorld(HtmlResource):
html = """
<html><body><script>document.write('hello world!');</script></body></html>
"""
extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}
class ResponseSpider(scrapy.Spider):
""" Make a request to URL, return Scrapy response """
custom_settings = {
'HTTPERROR_ALLOW_ALL': True,
'ROBOTSTXT_OBEY': True,
}
url = None

def start_requests(self):
yield SplashRequest(self.url)

class Http400Resource(HtmlResource):
status_code = 400
html = "Website returns HTTP 400 error"
def parse(self, response):
yield {'response': response}


class LuaSpider(ResponseSpider):
""" Make a request to URL using default Lua script """
headers = None
splash_headers = None

class ManyCookies(Resource, object):
class SetMyCookie(HtmlResource):
html = "hello!"
extra_headers = {'Set-Cookie': 'login=1'}
def start_requests(self):
yield SplashRequest(self.url,
endpoint='execute',
args={'lua_source': DEFAULT_SCRIPT},
headers=self.headers,
splash_headers=self.splash_headers)

def __init__(self):
super(ManyCookies, self).__init__()
self.putChild(b'', HelloWorld())
self.putChild(b'login', self.SetMyCookie())

class ScrapyAuthSpider(LuaSpider):
""" Spider with incorrect (old, insecure) auth method """
http_user = 'user'
http_pass = 'userpass'


class ResponseSpider(scrapy.Spider):
""" Make a request to URL, return Scrapy response """
url = None
class NonSplashSpider(ResponseSpider):
""" Spider which uses HTTP auth and doesn't use Splash """
http_user = 'user'
http_pass = 'userpass'

def start_requests(self):
yield SplashRequest(self.url)
yield scrapy.Request(self.url)

def parse(self, response):
yield {'response': response}

def assert_single_response(items):
assert len(items) == 1
return items[0]['response']


@requires_splash
@inlineCallbacks
def test_basic(settings):
items, url, crawler = yield crawl_items(ResponseSpider, HelloWorld,
settings)
assert len(items) == 1
resp = items[0]['response']
resp = assert_single_response(items)
assert resp.url == url
assert resp.css('body::text').extract_first().strip() == "hello world!"

@@ -124,8 +149,7 @@ def start_requests(self):

items, url, crawler = yield crawl_items(LuaScriptSpider, HelloWorld,
settings)
assert len(items) == 1
resp = items[0]['response']
resp = assert_single_response(items)
assert resp.url == url + "/#foo"
assert resp.status == resp.splash_response_status == 200
assert resp.css('body::text').extract_first().strip() == "hello world!"
@@ -140,29 +164,19 @@ def start_requests(self):
@inlineCallbacks
def test_bad_request(settings):
class BadRequestSpider(ResponseSpider):
custom_settings = {'HTTPERROR_ALLOW_ALL': True}

def start_requests(self):
yield SplashRequest(self.url, endpoint='execute',
args={'lua_source': DEFAULT_SCRIPT, 'wait': 'bar'})

class GoodRequestSpider(ResponseSpider):
custom_settings = {'HTTPERROR_ALLOW_ALL': True}

def start_requests(self):
yield SplashRequest(self.url, endpoint='execute',
args={'lua_source': DEFAULT_SCRIPT})


items, url, crawler = yield crawl_items(BadRequestSpider, HelloWorld,
settings)
resp = items[0]['response']
resp = assert_single_response(items)
assert resp.status == 400
assert resp.splash_response_status == 400

items, url, crawler = yield crawl_items(GoodRequestSpider, Http400Resource,
items, url, crawler = yield crawl_items(LuaSpider, Http400Resource,
settings)
resp = items[0]['response']
resp = assert_single_response(items)
assert resp.status == 400
assert resp.splash_response_status == 200

@@ -306,3 +320,218 @@ def _cookie_dict(har_cookies):
'bomb': BOMB,
}
assert splash_request_headers.get(b'Cookie') is None


@requires_splash
@inlineCallbacks
def test_access_http_auth(settings):
# website is protected
items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected,
settings)
response = assert_single_response(items)
assert response.status == 401
assert response.splash_response_status == 200

# header can be used to access it
AUTH_HEADERS = {'Authorization': basic_auth_header('user', 'userpass')}
kwargs = {'headers': AUTH_HEADERS}
items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected,
settings, kwargs)
response = assert_single_response(items)
assert 'hello' in response.body_as_unicode()
assert response.status == 200
assert response.splash_response_status == 200


@requires_splash
@inlineCallbacks
def test_protected_splash_no_auth(settings_auth):
items, url, crawler = yield crawl_items(LuaSpider, HelloWorld,
settings_auth)
response = assert_single_response(items)
assert 'Unauthorized' in response.body_as_unicode()
assert 'hello' not in response.body_as_unicode()
assert response.status == 401
assert response.splash_response_status == 401


@requires_splash
@inlineCallbacks
def test_protected_splash_manual_headers_auth(settings_auth):
AUTH_HEADERS = {'Authorization': basic_auth_header('user', 'userpass')}
kwargs = {'splash_headers': AUTH_HEADERS}

# auth via splash_headers should work
items, url, crawler = yield crawl_items(LuaSpider, HelloWorld,
settings_auth, kwargs)
response = assert_single_response(items)
assert 'hello' in response.body_as_unicode()
assert response.status == 200
assert response.splash_response_status == 200

# but only for Splash, not for a remote website
items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected,
settings_auth, kwargs)
response = assert_single_response(items)
assert 'hello' not in response.body_as_unicode()
assert response.status == 401
assert response.splash_response_status == 200


@requires_splash
@inlineCallbacks
def test_protected_splash_settings_auth(settings_auth):
settings_auth['SPLASH_USER'] = 'user'
settings_auth['SPLASH_PASS'] = 'userpass'

# auth via settings works
items, url, crawler = yield crawl_items(LuaSpider, HelloWorld,
settings_auth)
response = assert_single_response(items)
assert 'Unauthorized' not in response.body_as_unicode()
assert 'hello' in response.body_as_unicode()
assert response.status == 200
assert response.splash_response_status == 200

# they can be overridden via splash_headers
bad_auth = {'splash_headers': {'Authorization': 'foo'}}
items, url, crawler = yield crawl_items(LuaSpider, HelloWorld,
settings_auth, bad_auth)
response = assert_single_response(items)
assert response.status == 401
assert response.splash_response_status == 401

# auth error on remote website
items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected,
settings_auth)
response = assert_single_response(items)
assert response.status == 401
assert response.splash_response_status == 200

# auth both for Splash and for the remote website
REMOTE_AUTH = {'Authorization': basic_auth_header('user', 'userpass')}
remote_auth_kwargs = {'headers': REMOTE_AUTH}
items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected,
settings_auth, remote_auth_kwargs)
response = assert_single_response(items)
assert response.status == 200
assert response.splash_response_status == 200
assert 'hello' in response.body_as_unicode()

# enable remote auth, but not splash auth - request should fail
del settings_auth['SPLASH_USER']
del settings_auth['SPLASH_PASS']
items, url, crawler = yield crawl_items(LuaSpider,
HelloWorldProtected,
settings_auth, remote_auth_kwargs)
response = assert_single_response(items)
assert response.status == 401
assert response.splash_response_status == 401


@requires_splash
@inlineCallbacks
def test_protected_splash_httpauth_middleware(settings_auth):
# httpauth middleware should enable auth for Splash, for backwards
# compatibility reasons
items, url, crawler = yield crawl_items(ScrapyAuthSpider, HelloWorld,
settings_auth)
response = assert_single_response(items)
assert 'Unauthorized' not in response.body_as_unicode()
assert 'hello' in response.body_as_unicode()
assert response.status == 200
assert response.splash_response_status == 200

# but not for a remote website
items, url, crawler = yield crawl_items(ScrapyAuthSpider,
HelloWorldProtected,
settings_auth)
response = assert_single_response(items)
assert 'hello' not in response.body_as_unicode()
assert response.status == 401
assert response.splash_response_status == 200

# headers shouldn't be sent to the robots.txt file
items, url, crawler = yield crawl_items(ScrapyAuthSpider,
HelloWorldDisallowAuth,
settings_auth)
response = assert_single_response(items)
assert 'hello' in response.body_as_unicode()
assert response.status == 200
assert response.splash_response_status == 200

# httpauth shouldn't be disabled for non-Splash requests
items, url, crawler = yield crawl_items(NonSplashSpider,
HelloWorldProtected,
settings_auth)
response = assert_single_response(items)
assert 'hello' in response.body_as_unicode()
assert response.status == 200
assert not hasattr(response, 'splash_response_status')


@pytest.mark.xfail(
parse_version(scrapy.__version__) < parse_version("1.1"),
reason="https://github.com/scrapy/scrapy/issues/1471",
strict=True,
run=True,
)
@requires_splash
@inlineCallbacks
def test_robotstxt_can_work(settings_auth):

def assert_robots_disabled(items):
response = assert_single_response(items)
assert response.status == response.splash_response_status == 200
assert b'hello' in response.body

def assert_robots_enabled(items, crawler):
assert len(items) == 0
assert crawler.stats.get_value('downloader/exception_type_count/scrapy.exceptions.IgnoreRequest') == 1

def _crawl_items(spider, resource):
return crawl_items(
spider,
resource,
settings_auth,
url_path='/', # https://github.com/scrapy/protego/issues/17
)

# when the old auth method is used, robots.txt should be disabled
items, url, crawler = yield _crawl_items(ScrapyAuthSpider,
HelloWorldDisallowByRobots)
assert_robots_disabled(items)

# but robots.txt should still work for non-Splash requests
items, url, crawler = yield _crawl_items(NonSplashSpider,
HelloWorldDisallowByRobots)
assert_robots_enabled(items, crawler)

# robots.txt should work when a proper auth method is used
settings_auth['SPLASH_USER'] = 'user'
settings_auth['SPLASH_PASS'] = 'userpass'
items, url, crawler = yield _crawl_items(LuaSpider,
HelloWorldDisallowByRobots)
assert_robots_enabled(items, crawler)

# disable robotstxt middleware - robots middleware shouldn't work
class DontObeyRobotsSpider(LuaSpider):
custom_settings = {
'HTTPERROR_ALLOW_ALL': True,
'ROBOTSTXT_OBEY': False,
}
items, url, crawler = yield _crawl_items(DontObeyRobotsSpider,
HelloWorldDisallowByRobots)
assert_robots_disabled(items)

# disable robotstxt middleware via request meta
class MetaDontObeyRobotsSpider(ResponseSpider):
def start_requests(self):
yield SplashRequest(self.url,
endpoint='execute',
meta={'dont_obey_robotstxt': True},
args={'lua_source': DEFAULT_SCRIPT})

items, url, crawler = yield _crawl_items(MetaDontObeyRobotsSpider,
HelloWorldDisallowByRobots)
assert_robots_disabled(items)
55 changes: 42 additions & 13 deletions tests/test_middleware.py
@@ -32,8 +32,8 @@ def _get_crawler(settings_dict):
return crawler


def _get_mw():
crawler = _get_crawler({})
def _get_mw(settings_dict=None):
crawler = _get_crawler(settings_dict or {})
return SplashMiddleware.from_crawler(crawler)


@@ -70,6 +70,7 @@ def test_splash_request():
# check request preprocessing
req2 = cookie_mw.process_request(req, None) or req
req2 = mw.process_request(req2, None) or req2

assert req2 is not None
assert req2 is not req
assert req2.url == "http://127.0.0.1:8050/render.html"
@@ -139,7 +140,9 @@ def cb():
headers={'X-My-Header': 'value'}
)
req2 = cookie_mw.process_request(req, None) or req
req2 = mw.process_request(req2, None)
req2 = mw.process_request(req2, None) or req2

assert req2.meta['ajax_crawlable'] is True
assert req2.meta['splash'] == {
'endpoint': 'execute',
'splash_url': "http://mysplash.example.com",
@@ -348,7 +351,7 @@ def test_magic_response2():
mw = _get_mw()
req = SplashRequest('http://example.com/', magic_response=True,
headers={'foo': 'bar'}, dont_send_headers=True)
req = mw.process_request(req, None)
req = mw.process_request(req, None) or req
assert 'headers' not in req.meta['splash']['args']

resp_data = {
@@ -372,7 +375,7 @@ def test_unicode_url():
req = SplashRequest(
# note unicode URL
u"http://example.com/", endpoint='execute')
req2 = mw.process_request(req, None)
req2 = mw.process_request(req, None) or req
res = {'html': '<html><body>Hello</body></html>'}
res_body = json.dumps(res)
response = TextResponse("http://mysplash.example.com/execute",
@@ -387,7 +390,7 @@ def test_magic_response_http_error():
def test_magic_response_http_error():
mw = _get_mw()
req = SplashRequest('http://example.com/foo')
req = mw.process_request(req, None)
req = mw.process_request(req, None) or req

resp_data = {
"info": {
@@ -414,7 +417,7 @@ def test_magic_response_http_error():
def test_change_response_class_to_text():
mw = _get_mw()
req = SplashRequest('http://example.com/', magic_response=True)
req = mw.process_request(req, None)
req = mw.process_request(req, None) or req
# Such response can come when downloading a file,
# or returning splash:html(): the headers say it's binary,
# but it can be decoded so it becomes a TextResponse.
@@ -437,7 +440,7 @@ def test_change_response_class_to_json_binary():
# but this is ok because magic_response presumes we are expecting
# a valid splash json response.
req = SplashRequest('http://example.com/', magic_response=False)
req = mw.process_request(req, None)
req = mw.process_request(req, None) or req
resp = Response('http://mysplash.example.com/execute',
headers={b'Content-Type': b'application/json'},
body=b'non-decodable data: \x98\x11\xe7\x17\x8f',
@@ -474,7 +477,7 @@ def _get_req():
# first call
req = _get_req()
req = cookie_mw.process_request(req, spider) or req
req = mw.process_request(req, spider)
req = mw.process_request(req, spider) or req
req = cache_mw.process_request(req, spider) or req
assert isinstance(req, scrapy.Request) # first call; the cache is empty

@@ -498,7 +501,7 @@ def _get_req():
# second call
req = _get_req()
req = cookie_mw.process_request(req, spider) or req
req = mw.process_request(req, spider)
req = mw.process_request(req, spider) or req
cached_resp = cache_mw.process_request(req, spider) or req

# response should be from cache:
@@ -666,6 +669,7 @@ def test_override_splash_url():
}
})
req = mw.process_request(req1, None)
req = mw.process_request(req, None) or req
assert req.url == 'http://splash.example.com/render.png'
assert json.loads(to_native_str(req.body)) == {'url': req1.url}

@@ -677,6 +681,7 @@ def test_url_with_fragment():
'splash': {'args': {'url': url}}
})
req = mw.process_request(req, None)
req = mw.process_request(req, None) or req
assert json.loads(to_native_str(req.body)) == {'url': url}


@@ -685,6 +690,7 @@ def test_splash_request_url_with_fragment():
url = "http://example.com#id1"
req = SplashRequest(url)
req = mw.process_request(req, None)
req = mw.process_request(req, None) or req
assert json.loads(to_native_str(req.body)) == {'url': url}


@@ -740,7 +746,7 @@ def test_slot_policy_per_domain():

def test_slot_policy_scrapy_default():
mw = _get_mw()
req = scrapy.Request("http://example.com", meta = {'splash': {
req = scrapy.Request("http://example.com", meta={'splash': {
'slot_policy': scrapy_splash.SlotPolicy.SCRAPY_DEFAULT
}})
req = mw.process_request(req, None)
@@ -749,7 +755,7 @@ def test_slot_policy_scrapy_default():

def test_adjust_timeout():
mw = _get_mw()
req1 = scrapy.Request("http://example.com", meta = {
req1 = scrapy.Request("http://example.com", meta={
'splash': {'args': {'timeout': 60, 'html': 1}},

# download_timeout is always present,
@@ -759,9 +765,32 @@ def test_adjust_timeout():
req1 = mw.process_request(req1, None)
assert req1.meta['download_timeout'] > 60

req2 = scrapy.Request("http://example.com", meta = {
req2 = scrapy.Request("http://example.com", meta={
'splash': {'args': {'html': 1}},
'download_timeout': 30,
})
req2 = mw.process_request(req2, None)
assert req2.meta['download_timeout'] == 30


def test_auth():
def assert_auth_header(user, pwd, header):
mw = _get_mw({'SPLASH_USER': user, 'SPLASH_PASS': pwd})
req = mw.process_request(SplashRequest("http://example.com"), None)
assert 'Authorization' in req.headers
assert req.headers['Authorization'] == header

def assert_no_auth_header(user, pwd):
if user is not None or pwd is not None:
mw = _get_mw({'SPLASH_USER': user, 'SPLASH_PASS': pwd})
else:
mw = _get_mw()
req = mw.process_request(SplashRequest("http://example.com"), None)
assert 'Authorization' not in req.headers

assert_auth_header('root', '', b'Basic cm9vdDo=')
assert_auth_header('root', 'pwd', b'Basic cm9vdDpwd2Q=')
assert_auth_header('', 'pwd', b'Basic OnB3ZA==')

assert_no_auth_header('', '')
assert_no_auth_header(None, None)
33 changes: 12 additions & 21 deletions tests/utils.py
@@ -3,11 +3,9 @@
import pytest
from pytest_twisted import inlineCallbacks
from twisted.internet.defer import returnValue
from twisted.web.resource import Resource
from scrapy.crawler import Crawler

from scrapy_splash.utils import to_bytes
from tests.mockserver import MockServer
from .mockserver import MockServer


requires_splash = pytest.mark.skipif(
@@ -16,33 +14,26 @@
)


class HtmlResource(Resource):
isLeaf = True
content_type = 'text/html'
html = ''
extra_headers = {}
status_code = 200

def render_GET(self, request):
request.setHeader(b'content-type', to_bytes(self.content_type))
for name, value in self.extra_headers.items():
request.setHeader(to_bytes(name), to_bytes(value))
request.setResponseCode(self.status_code)
return to_bytes(self.html)


@inlineCallbacks
def crawl_items(spider_cls, resource_cls, settings, spider_kwargs=None):
def crawl_items(
spider_cls,
resource_cls,
settings,
spider_kwargs=None,
url_path="",
):
""" Use spider_cls to crawl resource_cls. URL of the resource is passed
to the spider as ``url`` argument.
Return ``(items, resource_url, crawler)`` tuple.
"""
spider_kwargs = {} if spider_kwargs is None else spider_kwargs
crawler = make_crawler(spider_cls, settings)
with MockServer(resource_cls) as s:
root_url = s.root_url
print("mock server", s.root_url)
root_url = s.root_url + url_path
yield crawler.crawl(url=root_url, **spider_kwargs)
result = crawler.spider.collected_items, s.root_url, crawler
items = getattr(crawler.spider, 'collected_items', [])
result = items, root_url, crawler
returnValue(result)

