Skip to content

Commit

Permalink
Merge pull request #158 from scrapy-plugins/expose-original-info
Browse files Browse the repository at this point in the history
allow retrieving original response information
  • Loading branch information
kmike committed Jan 16, 2018
2 parents eb0b291 + 74b2d6e commit cde6b3f
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 25 deletions.
7 changes: 6 additions & 1 deletion README.rst
Expand Up @@ -271,6 +271,9 @@ to set ``meta['splash']['args']`` use ``SplashRequest(..., args=myargs)``.
and ``assert(splash:go(..))`` fails with an HTTP error
response.status is also set to HTTP error code.

Original URL, status and headers are available as ``response.real_url``,
``response.splash_response_status`` and ``response.splash_response_headers``.

This option is set to True by default if you use SplashRequest.
``render.json`` and ``execute`` endpoints may not have all the necessary
keys/values in the response.
Expand Down Expand Up @@ -631,7 +634,9 @@ aware of:

3. As seen by Scrapy, response.url is an URL of the Splash server.
scrapy-splash fixes it to be an URL of a requested page.
"Real" URL is still available as ``response.real_url``.
"Real" URL is still available as ``response.real_url``. scrapy-splash also
allows handling ``response.status`` and ``response.headers`` transparently
on the Scrapy side.

4. Some options depend on each other - for example, if you use timeout_
Splash option then you may want to set ``download_timeout``
Expand Down
7 changes: 4 additions & 3 deletions scrapy_splash/middleware.py
Expand Up @@ -23,6 +23,7 @@
json_based_hash,
parse_x_splash_saved_arguments_header,
)
from scrapy_splash.response import get_splash_status, get_splash_headers


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -379,7 +380,7 @@ def process_response(self, request, response, spider):

# handle save_args/load_args
self._process_x_splash_saved_arguments(request, response)
if response.status == 498:
if get_splash_status(response) == 498:
logger.debug("Got HTTP 498 response for {}; "
"sending arguments again.".format(request),
extra={'spider': spider})
Expand All @@ -390,7 +391,7 @@ def process_response(self, request, response, spider):

response = self._change_response_class(request, response)

if self.log_400 and response.status == 400:
if self.log_400 and get_splash_status(response) == 400:
self._log_400(request, response, spider)

return response
Expand Down Expand Up @@ -423,7 +424,7 @@ def _log_400(self, request, response, spider):

def _process_x_splash_saved_arguments(self, request, response):
""" Keep track of arguments saved by Splash. """
saved_args = response.headers.get(b'X-Splash-Saved-Arguments')
saved_args = get_splash_headers(response).get(b'X-Splash-Saved-Arguments')
if not saved_args:
return
saved_args = parse_x_splash_saved_arguments_header(saved_args)
Expand Down
28 changes: 24 additions & 4 deletions scrapy_splash/response.py
Expand Up @@ -11,6 +11,14 @@
from scrapy_splash.utils import headers_to_scrapy


def get_splash_status(resp):
    """Return the HTTP status of the original Splash response.

    Responses processed by scrapy-splash carry the original Splash
    status in ``splash_response_status``; for any other response the
    plain ``status`` attribute is used instead.
    """
    try:
        return resp.splash_response_status
    except AttributeError:
        return resp.status


def get_splash_headers(resp):
    """Return the headers of the original Splash response.

    Responses processed by scrapy-splash expose the original Splash
    headers as ``splash_response_headers``; for any other response the
    plain ``headers`` attribute is returned.
    """
    try:
        return resp.splash_response_headers
    except AttributeError:
        return resp.headers


class _SplashResponseMixin(object):
"""
This mixin fixes response.url and adds response.real_url
Expand All @@ -30,14 +38,23 @@ def __init__(self, url, *args, **kwargs):
if _url is not None:
self.real_url = url
url = _url
self.splash_response_status = kwargs.pop('splash_response_status',
None)
self.splash_response_headers = kwargs.pop('splash_response_headers',
None)
super(_SplashResponseMixin, self).__init__(url, *args, **kwargs)
if self.splash_response_status is None:
self.splash_response_status = self.status
if self.splash_response_headers is None:
self.splash_response_headers = self.headers.copy()

def replace(self, *args, **kwargs):
    """Create a new Response with the same attributes except for those
    given new values.

    Besides the standard Response attributes, the scrapy-splash
    extras (``real_url``, ``splash_response_status``,
    ``splash_response_headers``) are carried over as well, unless the
    caller overrides them explicitly.
    """
    carried_over = ('url', 'status', 'headers', 'body', 'request', 'flags',
                    'real_url', 'splash_response_status',
                    'splash_response_headers')
    for attr in carried_over:
        kwargs.setdefault(attr, getattr(self, attr))
    # An alternative response class may be requested via the 'cls' kwarg.
    response_cls = kwargs.pop('cls', self.__class__)
    return response_cls(*args, **kwargs)
Expand Down Expand Up @@ -80,11 +97,14 @@ class SplashJsonResponse(SplashResponse):
(['splash']['magic_response'] is not False), several other response
attributes (headers, body, url, status code) are set automatically:
* response.headers are filled from 'headers' keys;
* response.url is set to the value of 'url' key;
* response.url is set to the value of 'url' key, original url is
available as ``response.real_url``;
* response.headers are filled from 'headers' keys; original headers are
available as ``response.splash_response_headers``;
* response.status is set from the value of 'http_status' key; original
status is available as ``response.splash_response_status``;
* response.body is set to the value of 'html' key,
or to base64-decoded value of 'body' key;
* response.status is set from the value of 'http_status' key.
"""
def __init__(self, *args, **kwargs):
self.cookiejar = None
Expand Down
102 changes: 88 additions & 14 deletions tests/test_integration.py
Expand Up @@ -10,13 +10,14 @@
DEFAULT_SCRIPT = """
function main(splash)
splash:init_cookies(splash.args.cookies)
assert(splash:go{
splash:go{
splash.args.url,
headers=splash.args.headers,
http_method=splash.args.http_method,
body=splash.args.body,
})
assert(splash:wait(0.5))
}
local wait = tonumber(splash.args.wait or 0.5)
assert(splash:wait(wait))
local entries = splash:history()
local last_response = entries[#entries].response
Expand All @@ -40,6 +41,11 @@ class HelloWorld(HtmlResource):
extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}


class Http400Resource(HtmlResource):
    # Test resource serving a small HTML page with an HTTP 400 status,
    # so tests can distinguish the remote website's error status from
    # the status of Splash's own response.
    status_code = 400
    html = "Website returns HTTP 400 error"



class ManyCookies(Resource, object):
class SetMyCookie(HtmlResource):
Expand Down Expand Up @@ -94,6 +100,9 @@ def parse(self, response):
resp = items[0]['response']
assert resp.url == url
assert resp.css('body::text').get().strip() == "hello world!"
assert resp.status == resp.splash_response_status == 200
assert resp.headers == resp.splash_response_headers
assert resp.splash_response_headers['Content-Type'] == b"text/html; charset=utf-8"

resp2 = items[1]['response']
assert resp2.body == resp.body
Expand All @@ -118,12 +127,78 @@ def start_requests(self):
assert len(items) == 1
resp = items[0]['response']
assert resp.url == url + "/#foo"
assert resp.status == resp.splash_response_status == 200
assert resp.css('body::text').get().strip() == "hello world!"
assert resp.data['jsvalue'] == 3
assert resp.headers['X-MyHeader'] == b'my value'
assert resp.headers['Content-Type'] == b'text/html'
assert resp.splash_response_headers['Content-Type'] == b'application/json'
assert resp.data['args']['foo'] == 'bar'


@requires_splash
@inlineCallbacks
def test_bad_request(settings):
    """Both a Splash-side error and a remote-site HTTP 400 must surface
    as ``response.status == 400``, while ``splash_response_status``
    reflects only what Splash itself returned.
    """
    class BadRequestSpider(ResponseSpider):
        custom_settings = {'HTTPERROR_ALLOW_ALL': True}

        def start_requests(self):
            # Non-numeric 'wait' makes the Splash script fail -> HTTP 400
            # from Splash itself.
            yield SplashRequest(self.url, endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT, 'wait': 'bar'})

    class GoodRequestSpider(ResponseSpider):
        custom_settings = {'HTTPERROR_ALLOW_ALL': True}

        def start_requests(self):
            yield SplashRequest(self.url, endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT})

    # Splash rejects the request: both statuses are 400.
    collected, _, _ = yield crawl_items(BadRequestSpider, HelloWorld, settings)
    bad_resp = collected[0]['response']
    assert bad_resp.status == 400
    assert bad_resp.splash_response_status == 400

    # Remote site returns 400, Splash succeeds: statuses differ.
    collected, _, _ = yield crawl_items(GoodRequestSpider, Http400Resource,
                                        settings)
    remote_error_resp = collected[0]['response']
    assert remote_error_resp.status == 400
    assert remote_error_resp.splash_response_status == 200


@requires_splash
@inlineCallbacks
def test_cache_args(settings):
    """With ``cache_args``, only the first request sends the full
    ``lua_source``; subsequent requests reference it by hash, so the
    script body is absent from their bodies.
    """
    class CacheArgsSpider(ResponseSpider):
        def _request(self, url):
            return SplashRequest(url, endpoint='execute',
                                 args={'lua_source': DEFAULT_SCRIPT, 'x': 'yy'},
                                 cache_args=['lua_source'])

        def start_requests(self):
            yield self._request(self.url)

        def parse(self, response):
            yield {'response': response}
            yield self._request(self.url + "#foo")

    collected, _, _ = yield crawl_items(CacheArgsSpider, HelloWorld, settings)
    assert len(collected) == 2

    first = collected[0]['response']
    # First request carries the full script and the non-cached arg.
    assert b"function main(splash)" in first.request.body
    assert b"yy" in first.request.body
    print(first.body, first.request.body)

    second = collected[1]['response']
    # Second request omits the cached script but keeps the other arg.
    assert b"function main(splash)" not in second.request.body
    assert b"yy" in second.request.body
    print(second.body, second.request.body)


@requires_splash
@inlineCallbacks
def test_cookies(settings):
Expand Down Expand Up @@ -171,7 +246,6 @@ def parse_3(self, response):
args={'lua_source': DEFAULT_SCRIPT},
cookies={'bomb': BOMB})


def parse_4(self, response):
yield {'response': response}

Expand All @@ -185,19 +259,19 @@ def _cookie_dict(har_cookies):

# cookie should be sent to remote website, not to Splash
resp = items[0]['response']
splash_headers = resp.request.headers
splash_request_headers = resp.request.headers
cookies = resp.data['args']['cookies']
print(splash_headers)
print(splash_request_headers)
print(cookies)
assert _cookie_dict(cookies) == {
# 'login': '1', # FIXME
'x-set-splash': '1'
}
assert splash_headers.get(b'Cookie') is None
assert splash_request_headers.get(b'Cookie') is None

# new cookie should be also sent to remote website, not to Splash
resp2 = items[1]['response']
splash_headers = resp2.request.headers
splash_request_headers = resp2.request.headers
headers = resp2.data['args']['headers']
cookies = resp2.data['args']['cookies']
assert canonicalize_url(headers['Referer']) == canonicalize_url(url)
Expand All @@ -206,29 +280,29 @@ def _cookie_dict(har_cookies):
'x-set-splash': '1',
'sessionid': 'ABCD'
}
print(splash_headers)
print(splash_request_headers)
print(headers)
print(cookies)
assert splash_headers.get(b'Cookie') is None
assert splash_request_headers.get(b'Cookie') is None

# TODO/FIXME: Cookies fetched when working with Splash should be picked up
# by Scrapy
resp3 = items[2]['response']
splash_headers = resp3.request.headers
cookie_header = splash_headers.get(b'Cookie')
splash_request_headers = resp3.request.headers
cookie_header = splash_request_headers.get(b'Cookie')
assert b'x-set-scrapy=1' in cookie_header
assert b'login=1' in cookie_header
assert b'x-set-splash=1' in cookie_header
# assert b'sessionid=ABCD' in cookie_header # FIXME

# cookie bomb shouldn't cause problems
resp4 = items[3]['response']
splash_headers = resp4.request.headers
splash_request_headers = resp4.request.headers
cookies = resp4.data['args']['cookies']
assert _cookie_dict(cookies) == {
# 'login': '1',
'x-set-splash': '1',
'sessionid': 'ABCD',
'bomb': BOMB,
}
assert splash_headers.get(b'Cookie') is None
assert splash_request_headers.get(b'Cookie') is None
11 changes: 8 additions & 3 deletions tests/test_middleware.py
Expand Up @@ -188,7 +188,8 @@ def cb():
assert response2.text == response2.body_as_unicode() == res_body
assert response2.encoding == 'utf8'
assert response2.headers == {b'Content-Type': [b'application/json']}
assert response2.status == 200
assert response2.splash_response_headers == response2.headers
assert response2.status == response2.splash_response_status == 200


def test_magic_response():
Expand Down Expand Up @@ -233,7 +234,9 @@ def test_magic_response():
b'X-My-Header': [b'foo'],
b'Set-Cookie': [b'bar=baz'],
}
assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']}
assert resp2.status == 404
assert resp2.splash_response_status == 200
assert resp2.url == "http://exmaple.com/#id42"
assert len(resp2.cookiejar) == 3
cookies = [c for c in resp2.cookiejar]
Expand Down Expand Up @@ -359,7 +362,8 @@ def test_magic_response2():
assert resp2.data == resp_data
assert resp2.body == b'binary data'
assert resp2.headers == {b'Content-Type': [b'text/plain']}
assert resp2.status == 200
assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']}
assert resp2.status == resp2.splash_response_status == 200
assert resp2.url == "http://example.com/"


Expand Down Expand Up @@ -397,12 +401,13 @@ def test_magic_response_http_error():
"error": 400,
"type": "ScriptError"
}
resp = TextResponse("http://mysplash.example.com/execute",
resp = TextResponse("http://mysplash.example.com/execute", status=400,
headers={b'Content-Type': b'application/json'},
body=json.dumps(resp_data).encode('utf8'))
resp = mw.process_response(req, resp, None)
assert resp.data == resp_data
assert resp.status == 404
assert resp.splash_response_status == 400
assert resp.url == "http://example.com/foo"


Expand Down
2 changes: 2 additions & 0 deletions tests/utils.py
Expand Up @@ -20,11 +20,13 @@ class HtmlResource(Resource):
content_type = 'text/html'
html = ''
extra_headers = {}
status_code = 200

def render_GET(self, request):
    """Serve the configured HTML with the configured content type,
    extra headers and status code."""
    request.setHeader(b'content-type', to_bytes(self.content_type))
    for header_name, header_value in self.extra_headers.items():
        request.setHeader(to_bytes(header_name), to_bytes(header_value))
    # status_code defaults to 200 on HtmlResource; subclasses override it.
    request.setResponseCode(self.status_code)
    return to_bytes(self.html)


Expand Down

0 comments on commit cde6b3f

Please sign in to comment.