@@ -1,11 +1,22 @@
# -*- coding: utf-8 -*-
import pytest
import scrapy
from pkg_resources import parse_version
from pytest_twisted import inlineCallbacks
from twisted.web.resource import Resource
from w3lib.url import canonicalize_url
from w3lib.http import basic_auth_header
from scrapy_splash import SplashRequest
from .utils import crawl_items, requires_splash, HtmlResource
from .utils import crawl_items, requires_splash
from .resources import (
    HelloWorld,
    Http400Resource,
    ManyCookies,
    HelloWorldProtected,
    HelloWorldDisallowByRobots,
    HelloWorldDisallowAuth,
)
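
# Default Lua script used by the LuaSpider-based tests below.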
DEFAULT_SCRIPT = """
function main(splash)
@@ -16,7 +27,10 @@
    http_method=splash.args.http_method,
    body=splash.args.body,
  })
  local wait = tonumber(splash.args.wait or 0.5)
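  -- the default wait is lowered to 0.01 and splash.args.wait is now passed
  -- through unchanged; a non-numeric value such as 'bar' (see
  -- test_bad_request below) then makes splash:wait() fail with HTTP 400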
  local wait = 0.01
  if splash.args.wait ~= nil then
    wait = splash.args.wait
  end
  assert(splash:wait(wait))
  local entries = splash:history()
@@ -34,49 +48,60 @@
"""
class HelloWorld(HtmlResource):
    html = """
    <html><body><script>document.write('hello world!');</script></body></html>
    """
    extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}
class ResponseSpider(scrapy.Spider):
    """ Make a request to URL, return Scrapy response """
    custom_settings = {
        'HTTPERROR_ALLOW_ALL': True,
        'ROBOTSTXT_OBEY': True,
    }
    url = None

    def start_requests(self):
        yield SplashRequest(self.url)
class Http400Resource(HtmlResource):
    status_code = 400
    html = "Website returns HTTP 400 error"
    def parse(self, response):
        yield {'response': response}
class LuaSpider(ResponseSpider):
    """ Make a request to URL using default Lua script """
    headers = None
    splash_headers = None
class ManyCookies(Resource, object):
    class SetMyCookie(HtmlResource):
        html = "hello!"
        extra_headers = {'Set-Cookie': 'login=1'}
    def start_requests(self):
        yield SplashRequest(self.url,
                            endpoint='execute',
                            args={'lua_source': DEFAULT_SCRIPT},
                            headers=self.headers,
                            splash_headers=self.splash_headers)
    def __init__(self):
        super(ManyCookies, self).__init__()
        self.putChild(b'', HelloWorld())
        self.putChild(b'login', self.SetMyCookie())
class ScrapyAuthSpider(LuaSpider):
    """ Spider with incorrect (old, insecure) auth method """
    http_user = 'user'
    http_pass = 'userpass'
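    # http_user / http_pass are consumed by Scrapy's built-in
    # HttpAuthMiddleware; with SplashRequests the credentials end up being
    # sent to the Splash server itself (see the httpauth tests below)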
class ResponseSpider(scrapy.Spider):
    """ Make a request to URL, return Scrapy response """
    url = None
class NonSplashSpider(ResponseSpider):
    """ Spider which uses HTTP auth and doesn't use Splash """
    http_user = 'user'
    http_pass = 'userpass'
    def start_requests(self):
        yield SplashRequest(self.url)
        yield scrapy.Request(self.url)

    def parse(self, response):
        yield {'response': response}
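
# Helper shared by the tests below: each spider yields exactly one item
# wrapping the Scrapy response under test.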
def assert_single_response(items):
    assert len(items) == 1
    return items[0]['response']
@requires_splash
@inlineCallbacks
def test_basic(settings):
    items, url, crawler = yield crawl_items(ResponseSpider, HelloWorld,
                                            settings)
    assert len(items) == 1
    resp = items[0]['response']
    resp = assert_single_response(items)
    assert resp.url == url
    assert resp.css('body::text').extract_first().strip() == "hello world!"
@@ -124,8 +149,7 @@ def start_requests(self):
    items, url, crawler = yield crawl_items(LuaScriptSpider, HelloWorld,
                                            settings)
    assert len(items) == 1
    resp = items[0]['response']
    resp = assert_single_response(items)
    assert resp.url == url + "/#foo"
    assert resp.status == resp.splash_response_status == 200
    assert resp.css('body::text').extract_first().strip() == "hello world!"
@@ -140,29 +164,19 @@ def start_requests(self):
@inlineCallbacks
def test_bad_request(settings):
    class BadRequestSpider(ResponseSpider):
        custom_settings = {'HTTPERROR_ALLOW_ALL': True}

        def start_requests(self):
            yield SplashRequest(self.url, endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT, 'wait': 'bar'})

    class GoodRequestSpider(ResponseSpider):
        custom_settings = {'HTTPERROR_ALLOW_ALL': True}

        def start_requests(self):
            yield SplashRequest(self.url, endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT})
    items, url, crawler = yield crawl_items(BadRequestSpider, HelloWorld,
                                            settings)
    resp = items[0]['response']
    resp = assert_single_response(items)
    assert resp.status == 400
    assert resp.splash_response_status == 400
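
    # a 400 from the website itself, by contrast, still means a 200 from
    # Splash: rendering succeeded even though the page returned an error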
    items, url, crawler = yield crawl_items(GoodRequestSpider, Http400Resource,
    items, url, crawler = yield crawl_items(LuaSpider, Http400Resource,
                                            settings)
    resp = items[0]['response']
    resp = assert_single_response(items)
    assert resp.status == 400
    assert resp.splash_response_status == 200
@@ -306,3 +320,218 @@ def _cookie_dict(har_cookies):
        'bomb': BOMB,
    }
    assert splash_request_headers.get(b'Cookie') is None
@requires_splash
@inlineCallbacks
def test_access_http_auth(settings):
    # website is protected
    items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected,
                                            settings)
    response = assert_single_response(items)
    assert response.status == 401
    assert response.splash_response_status == 200
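    # (response.status mirrors the remote website's status as reported by
    # the Lua script; splash_response_status is the status of Splash's own
    # HTTP reply)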

    # the Authorization header can be used to access the protected website
    AUTH_HEADERS = {'Authorization': basic_auth_header('user', 'userpass')}
    kwargs = {'headers': AUTH_HEADERS}
    items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected,
                                            settings, kwargs)
    response = assert_single_response(items)
    assert 'hello' in response.body_as_unicode()
    assert response.status == 200
    assert response.splash_response_status == 200
@requires_splash
@inlineCallbacks
def test_protected_splash_no_auth(settings_auth):
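    # settings_auth is assumed to point at a Splash instance which itself
    # requires HTTP Basic auth, so the 401 below comes from Splash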
    items, url, crawler = yield crawl_items(LuaSpider, HelloWorld,
                                            settings_auth)
    response = assert_single_response(items)
    assert 'Unauthorized' in response.body_as_unicode()
    assert 'hello' not in response.body_as_unicode()
    assert response.status == 401
    assert response.splash_response_status == 401
@requires_splash
@inlineCallbacks
def test_protected_splash_manual_headers_auth(settings_auth):
    AUTH_HEADERS = {'Authorization': basic_auth_header('user', 'userpass')}
    kwargs = {'splash_headers': AUTH_HEADERS}

    # auth via splash_headers should work
    items, url, crawler = yield crawl_items(LuaSpider, HelloWorld,
                                            settings_auth, kwargs)
    response = assert_single_response(items)
    assert 'hello' in response.body_as_unicode()
    assert response.status == 200
    assert response.splash_response_status == 200

    # but only for Splash, not for a remote website
    items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected,
                                            settings_auth, kwargs)
    response = assert_single_response(items)
    assert 'hello' not in response.body_as_unicode()
    assert response.status == 401
    assert response.splash_response_status == 200
@requires_splash
@inlineCallbacks
def test_protected_splash_settings_auth(settings_auth):
    settings_auth['SPLASH_USER'] = 'user'
    settings_auth['SPLASH_PASS'] = 'userpass'
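    # (SPLASH_USER / SPLASH_PASS is the documented way to authenticate
    # against a password-protected Splash instance)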

    # auth via settings works
    items, url, crawler = yield crawl_items(LuaSpider, HelloWorld,
                                            settings_auth)
    response = assert_single_response(items)
    assert 'Unauthorized' not in response.body_as_unicode()
    assert 'hello' in response.body_as_unicode()
    assert response.status == 200
    assert response.splash_response_status == 200

    # the settings can be overridden via splash_headers
    bad_auth = {'splash_headers': {'Authorization': 'foo'}}
    items, url, crawler = yield crawl_items(LuaSpider, HelloWorld,
                                            settings_auth, bad_auth)
    response = assert_single_response(items)
    assert response.status == 401
    assert response.splash_response_status == 401

    # auth error on the remote website
    items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected,
                                            settings_auth)
    response = assert_single_response(items)
    assert response.status == 401
    assert response.splash_response_status == 200

    # auth both for Splash and for the remote website
    REMOTE_AUTH = {'Authorization': basic_auth_header('user', 'userpass')}
    remote_auth_kwargs = {'headers': REMOTE_AUTH}
    items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected,
                                            settings_auth, remote_auth_kwargs)
    response = assert_single_response(items)
    assert response.status == 200
    assert response.splash_response_status == 200
    assert 'hello' in response.body_as_unicode()

    # enable remote auth, but not Splash auth - the request should fail
    del settings_auth['SPLASH_USER']
    del settings_auth['SPLASH_PASS']
    items, url, crawler = yield crawl_items(LuaSpider,
                                            HelloWorldProtected,
                                            settings_auth, remote_auth_kwargs)
    response = assert_single_response(items)
    assert response.status == 401
    assert response.splash_response_status == 401
@requires_splash
@inlineCallbacks
def test_protected_splash_httpauth_middleware(settings_auth):
    # the httpauth middleware should enable auth for Splash, for backwards
    # compatibility reasons
    items, url, crawler = yield crawl_items(ScrapyAuthSpider, HelloWorld,
                                            settings_auth)
    response = assert_single_response(items)
    assert 'Unauthorized' not in response.body_as_unicode()
    assert 'hello' in response.body_as_unicode()
    assert response.status == 200
    assert response.splash_response_status == 200

    # but not for a remote website
    items, url, crawler = yield crawl_items(ScrapyAuthSpider,
                                            HelloWorldProtected,
                                            settings_auth)
    response = assert_single_response(items)
    assert 'hello' not in response.body_as_unicode()
    assert response.status == 401
    assert response.splash_response_status == 200

    # auth headers shouldn't be sent with the robots.txt request
    items, url, crawler = yield crawl_items(ScrapyAuthSpider,
                                            HelloWorldDisallowAuth,
                                            settings_auth)
    response = assert_single_response(items)
    assert 'hello' in response.body_as_unicode()
    assert response.status == 200
    assert response.splash_response_status == 200

    # httpauth shouldn't be disabled for non-Splash requests
    items, url, crawler = yield crawl_items(NonSplashSpider,
                                            HelloWorldProtected,
                                            settings_auth)
    response = assert_single_response(items)
    assert 'hello' in response.body_as_unicode()
    assert response.status == 200
    assert not hasattr(response, 'splash_response_status')
@pytest.mark.xfail(
    parse_version(scrapy.__version__) < parse_version("1.1"),
    reason="https://github.com/scrapy/scrapy/issues/1471",
    strict=True,
    run=True,
)
@requires_splash
@inlineCallbacks
def test_robotstxt_can_work(settings_auth):

    def assert_robots_disabled(items):
        response = assert_single_response(items)
        assert response.status == response.splash_response_status == 200
        assert b'hello' in response.body

    def assert_robots_enabled(items, crawler):
        assert len(items) == 0
        assert crawler.stats.get_value(
            'downloader/exception_type_count/'
            'scrapy.exceptions.IgnoreRequest') == 1

    def _crawl_items(spider, resource):
        return crawl_items(
            spider,
            resource,
            settings_auth,
            url_path='/',  # https://github.com/scrapy/protego/issues/17
        )

    # when the old auth method is used, robots.txt handling is disabled
    items, url, crawler = yield _crawl_items(ScrapyAuthSpider,
                                             HelloWorldDisallowByRobots)
    assert_robots_disabled(items)

    # but robots.txt should still work for non-Splash requests
    items, url, crawler = yield _crawl_items(NonSplashSpider,
                                             HelloWorldDisallowByRobots)
    assert_robots_enabled(items, crawler)

    # robots.txt should work when a proper auth method is used
    settings_auth['SPLASH_USER'] = 'user'
    settings_auth['SPLASH_PASS'] = 'userpass'
    items, url, crawler = yield _crawl_items(LuaSpider,
                                             HelloWorldDisallowByRobots)
    assert_robots_enabled(items, crawler)

    # with ROBOTSTXT_OBEY disabled, requests shouldn't be filtered out
    class DontObeyRobotsSpider(LuaSpider):
        custom_settings = {
            'HTTPERROR_ALLOW_ALL': True,
            'ROBOTSTXT_OBEY': False,
        }

    items, url, crawler = yield _crawl_items(DontObeyRobotsSpider,
                                             HelloWorldDisallowByRobots)
    assert_robots_disabled(items)

    # robots.txt can also be disabled per-request, via request meta
    class MetaDontObeyRobotsSpider(ResponseSpider):
        def start_requests(self):
            yield SplashRequest(self.url,
                                endpoint='execute',
                                meta={'dont_obey_robotstxt': True},
                                args={'lua_source': DEFAULT_SCRIPT})

    items, url, crawler = yield _crawl_items(MetaDontObeyRobotsSpider,
                                             HelloWorldDisallowByRobots)
    assert_robots_disabled(items)