Skip to content

Commit

Permalink
Merge pull request #158 from scrapy-plugins/expose-original-info
Browse files Browse the repository at this point in the history
allow retrieving original response information
  • Loading branch information
kmike committed Jan 16, 2018
2 parents eb0b291 + 74b2d6e commit cde6b3f
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 25 deletions.
7 changes: 6 additions & 1 deletion README.rst
Expand Up @@ -271,6 +271,9 @@ to set ``meta['splash']['args']`` use ``SplashRequest(..., args=myargs)``.
and ``assert(splash:go(..))`` fails with an HTTP error
response.status is also set to HTTP error code.

Original URL, status and headers are available as ``response.real_url``,
``response.splash_response_status`` and ``response.splash_response_headers``.

This option is set to True by default if you use SplashRequest.
``render.json`` and ``execute`` endpoints may not have all the necessary
keys/values in the response.
Expand Down Expand Up @@ -631,7 +634,9 @@ aware of:

3. As seen by Scrapy, response.url is an URL of the Splash server.
scrapy-splash fixes it to be an URL of a requested page.
"Real" URL is still available as ``response.real_url``.
"Real" URL is still available as ``response.real_url``. scrapy-splash also
allows handling ``response.status`` and ``response.headers`` transparently
on the Scrapy side.

4. Some options depend on each other - for example, if you use timeout_
Splash option then you may want to set ``download_timeout``
Expand Down
7 changes: 4 additions & 3 deletions scrapy_splash/middleware.py
Expand Up @@ -23,6 +23,7 @@
json_based_hash,
parse_x_splash_saved_arguments_header,
)
from scrapy_splash.response import get_splash_status, get_splash_headers


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -379,7 +380,7 @@ def process_response(self, request, response, spider):

# handle save_args/load_args
self._process_x_splash_saved_arguments(request, response)
if response.status == 498:
if get_splash_status(response) == 498:
logger.debug("Got HTTP 498 response for {}; "
"sending arguments again.".format(request),
extra={'spider': spider})
Expand All @@ -390,7 +391,7 @@ def process_response(self, request, response, spider):

response = self._change_response_class(request, response)

if self.log_400 and response.status == 400:
if self.log_400 and get_splash_status(response) == 400:
self._log_400(request, response, spider)

return response
Expand Down Expand Up @@ -423,7 +424,7 @@ def _log_400(self, request, response, spider):

def _process_x_splash_saved_arguments(self, request, response):
""" Keep track of arguments saved by Splash. """
saved_args = response.headers.get(b'X-Splash-Saved-Arguments')
saved_args = get_splash_headers(response).get(b'X-Splash-Saved-Arguments')
if not saved_args:
return
saved_args = parse_x_splash_saved_arguments_header(saved_args)
Expand Down
28 changes: 24 additions & 4 deletions scrapy_splash/response.py
Expand Up @@ -11,6 +11,14 @@
from scrapy_splash.utils import headers_to_scrapy


def get_splash_status(resp):
    """Return the HTTP status of the original Splash response.

    Responses processed by scrapy-splash carry the original Splash
    status in ``splash_response_status``; for any other response the
    plain ``status`` attribute is used instead.
    """
    try:
        return resp.splash_response_status
    except AttributeError:
        return resp.status


def get_splash_headers(resp):
    """Return the headers of the original Splash response.

    Responses processed by scrapy-splash expose the original Splash
    headers as ``splash_response_headers``; for any other response the
    plain ``headers`` attribute is returned.
    """
    try:
        return resp.splash_response_headers
    except AttributeError:
        return resp.headers


class _SplashResponseMixin(object):
"""
This mixin fixes response.url and adds response.real_url
Expand All @@ -30,14 +38,23 @@ def __init__(self, url, *args, **kwargs):
if _url is not None:
self.real_url = url
url = _url
self.splash_response_status = kwargs.pop('splash_response_status',
None)
self.splash_response_headers = kwargs.pop('splash_response_headers',
None)
super(_SplashResponseMixin, self).__init__(url, *args, **kwargs)
if self.splash_response_status is None:
self.splash_response_status = self.status
if self.splash_response_headers is None:
self.splash_response_headers = self.headers.copy()

def replace(self, *args, **kwargs):
    """Create a new Response with the same attributes except for those
    given new values.

    Besides the standard Response attributes, the scrapy-splash
    extras (``real_url``, ``splash_response_status``,
    ``splash_response_headers``) are carried over as well, unless the
    caller overrides them explicitly.
    """
    carried_over = ('url', 'status', 'headers', 'body', 'request', 'flags',
                    'real_url', 'splash_response_status',
                    'splash_response_headers')
    for attr in carried_over:
        kwargs.setdefault(attr, getattr(self, attr))
    # An alternative response class may be requested via the 'cls' kwarg.
    response_cls = kwargs.pop('cls', self.__class__)
    return response_cls(*args, **kwargs)
Expand Down Expand Up @@ -80,11 +97,14 @@ class SplashJsonResponse(SplashResponse):
(['splash']['magic_response'] is not False), several other response
attributes (headers, body, url, status code) are set automatically:
* response.headers are filled from 'headers' keys;
* response.url is set to the value of 'url' key;
* response.url is set to the value of 'url' key, original url is
available as ``response.real_url``;
* response.headers are filled from 'headers' keys; original headers are
available as ``response.splash_response_headers``;
* response.status is set from the value of 'http_status' key; original
status is available as ``response.splash_response_status``;
* response.body is set to the value of 'html' key,
or to base64-decoded value of 'body' key;
* response.status is set from the value of 'http_status' key.
"""
def __init__(self, *args, **kwargs):
self.cookiejar = None
Expand Down
102 changes: 88 additions & 14 deletions tests/test_integration.py
Expand Up @@ -10,13 +10,14 @@
DEFAULT_SCRIPT = """
function main(splash)
splash:init_cookies(splash.args.cookies)
assert(splash:go{
splash:go{
splash.args.url,
headers=splash.args.headers,
http_method=splash.args.http_method,
body=splash.args.body,
})
assert(splash:wait(0.5))
}
local wait = tonumber(splash.args.wait or 0.5)
assert(splash:wait(wait))
local entries = splash:history()
local last_response = entries[#entries].response
Expand All @@ -40,6 +41,11 @@ class HelloWorld(HtmlResource):
extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}


class Http400Resource(HtmlResource):
    # Test resource serving a small HTML page with an HTTP 400 status,
    # so tests can distinguish the remote website's error status from
    # the status of Splash's own response.
    status_code = 400
    html = "Website returns HTTP 400 error"



class ManyCookies(Resource, object):
class SetMyCookie(HtmlResource):
Expand Down Expand Up @@ -94,6 +100,9 @@ def parse(self, response):
resp = items[0]['response']
assert resp.url == url
assert resp.css('body::text').get().strip() == "hello world!"
assert resp.status == resp.splash_response_status == 200
assert resp.headers == resp.splash_response_headers
assert resp.splash_response_headers['Content-Type'] == b"text/html; charset=utf-8"

resp2 = items[1]['response']
assert resp2.body == resp.body
Expand All @@ -118,12 +127,78 @@ def start_requests(self):
assert len(items) == 1
resp = items[0]['response']
assert resp.url == url + "/#foo"
assert resp.status == resp.splash_response_status == 200
assert resp.css('body::text').get().strip() == "hello world!"
assert resp.data['jsvalue'] == 3
assert resp.headers['X-MyHeader'] == b'my value'
assert resp.headers['Content-Type'] == b'text/html'
assert resp.splash_response_headers['Content-Type'] == b'application/json'
assert resp.data['args']['foo'] == 'bar'


@requires_splash
@inlineCallbacks
def test_bad_request(settings):
    """Both a Splash-side error and a remote-site HTTP 400 must surface
    as ``response.status == 400``, while ``splash_response_status``
    reflects only what Splash itself returned.
    """
    class BadRequestSpider(ResponseSpider):
        custom_settings = {'HTTPERROR_ALLOW_ALL': True}

        def start_requests(self):
            # Non-numeric 'wait' makes the Splash script fail -> HTTP 400
            # from Splash itself.
            yield SplashRequest(self.url, endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT, 'wait': 'bar'})

    class GoodRequestSpider(ResponseSpider):
        custom_settings = {'HTTPERROR_ALLOW_ALL': True}

        def start_requests(self):
            yield SplashRequest(self.url, endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT})

    # Splash rejects the request: both statuses are 400.
    collected, _, _ = yield crawl_items(BadRequestSpider, HelloWorld, settings)
    bad_resp = collected[0]['response']
    assert bad_resp.status == 400
    assert bad_resp.splash_response_status == 400

    # Remote site returns 400, Splash succeeds: statuses differ.
    collected, _, _ = yield crawl_items(GoodRequestSpider, Http400Resource,
                                        settings)
    remote_error_resp = collected[0]['response']
    assert remote_error_resp.status == 400
    assert remote_error_resp.splash_response_status == 200


@requires_splash
@inlineCallbacks
def test_cache_args(settings):
    """With ``cache_args``, only the first request sends the full
    ``lua_source``; subsequent requests reference it by hash, so the
    script body is absent from their bodies.
    """
    class CacheArgsSpider(ResponseSpider):
        def _request(self, url):
            return SplashRequest(url, endpoint='execute',
                                 args={'lua_source': DEFAULT_SCRIPT, 'x': 'yy'},
                                 cache_args=['lua_source'])

        def start_requests(self):
            yield self._request(self.url)

        def parse(self, response):
            yield {'response': response}
            yield self._request(self.url + "#foo")

    collected, _, _ = yield crawl_items(CacheArgsSpider, HelloWorld, settings)
    assert len(collected) == 2

    first = collected[0]['response']
    # First request carries the full script and the non-cached arg.
    assert b"function main(splash)" in first.request.body
    assert b"yy" in first.request.body
    print(first.body, first.request.body)

    second = collected[1]['response']
    # Second request omits the cached script but keeps the other arg.
    assert b"function main(splash)" not in second.request.body
    assert b"yy" in second.request.body
    print(second.body, second.request.body)


@requires_splash
@inlineCallbacks
def test_cookies(settings):
Expand Down Expand Up @@ -171,7 +246,6 @@ def parse_3(self, response):
args={'lua_source': DEFAULT_SCRIPT},
cookies={'bomb': BOMB})


def parse_4(self, response):
yield {'response': response}

Expand All @@ -185,19 +259,19 @@ def _cookie_dict(har_cookies):

# cookie should be sent to remote website, not to Splash
resp = items[0]['response']
splash_headers = resp.request.headers
splash_request_headers = resp.request.headers
cookies = resp.data['args']['cookies']
print(splash_headers)
print(splash_request_headers)
print(cookies)
assert _cookie_dict(cookies) == {
# 'login': '1', # FIXME
'x-set-splash': '1'
}
assert splash_headers.get(b'Cookie') is None
assert splash_request_headers.get(b'Cookie') is None

# new cookie should be also sent to remote website, not to Splash
resp2 = items[1]['response']
splash_headers = resp2.request.headers
splash_request_headers = resp2.request.headers
headers = resp2.data['args']['headers']
cookies = resp2.data['args']['cookies']
assert canonicalize_url(headers['Referer']) == canonicalize_url(url)
Expand All @@ -206,29 +280,29 @@ def _cookie_dict(har_cookies):
'x-set-splash': '1',
'sessionid': 'ABCD'
}
print(splash_headers)
print(splash_request_headers)
print(headers)
print(cookies)
assert splash_headers.get(b'Cookie') is None
assert splash_request_headers.get(b'Cookie') is None

# TODO/FIXME: Cookies fetched when working with Splash should be picked up
# by Scrapy
resp3 = items[2]['response']
splash_headers = resp3.request.headers
cookie_header = splash_headers.get(b'Cookie')
splash_request_headers = resp3.request.headers
cookie_header = splash_request_headers.get(b'Cookie')
assert b'x-set-scrapy=1' in cookie_header
assert b'login=1' in cookie_header
assert b'x-set-splash=1' in cookie_header
# assert b'sessionid=ABCD' in cookie_header # FIXME

# cookie bomb shouldn't cause problems
resp4 = items[3]['response']
splash_headers = resp4.request.headers
splash_request_headers = resp4.request.headers
cookies = resp4.data['args']['cookies']
assert _cookie_dict(cookies) == {
# 'login': '1',
'x-set-splash': '1',
'sessionid': 'ABCD',
'bomb': BOMB,
}
assert splash_headers.get(b'Cookie') is None
assert splash_request_headers.get(b'Cookie') is None
11 changes: 8 additions & 3 deletions tests/test_middleware.py
Expand Up @@ -188,7 +188,8 @@ def cb():
assert response2.text == response2.body_as_unicode() == res_body
assert response2.encoding == 'utf8'
assert response2.headers == {b'Content-Type': [b'application/json']}
assert response2.status == 200
assert response2.splash_response_headers == response2.headers
assert response2.status == response2.splash_response_status == 200


def test_magic_response():
Expand Down Expand Up @@ -233,7 +234,9 @@ def test_magic_response():
b'X-My-Header': [b'foo'],
b'Set-Cookie': [b'bar=baz'],
}
assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']}
assert resp2.status == 404
assert resp2.splash_response_status == 200
assert resp2.url == "http://exmaple.com/#id42"
assert len(resp2.cookiejar) == 3
cookies = [c for c in resp2.cookiejar]
Expand Down Expand Up @@ -359,7 +362,8 @@ def test_magic_response2():
assert resp2.data == resp_data
assert resp2.body == b'binary data'
assert resp2.headers == {b'Content-Type': [b'text/plain']}
assert resp2.status == 200
assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']}
assert resp2.status == resp2.splash_response_status == 200
assert resp2.url == "http://example.com/"


Expand Down Expand Up @@ -397,12 +401,13 @@ def test_magic_response_http_error():
"error": 400,
"type": "ScriptError"
}
resp = TextResponse("http://mysplash.example.com/execute",
resp = TextResponse("http://mysplash.example.com/execute", status=400,
headers={b'Content-Type': b'application/json'},
body=json.dumps(resp_data).encode('utf8'))
resp = mw.process_response(req, resp, None)
assert resp.data == resp_data
assert resp.status == 404
assert resp.splash_response_status == 400
assert resp.url == "http://example.com/foo"


Expand Down
2 changes: 2 additions & 0 deletions tests/utils.py
Expand Up @@ -20,11 +20,13 @@ class HtmlResource(Resource):
content_type = 'text/html'
html = ''
extra_headers = {}
status_code = 200

def render_GET(self, request):
    """Serve the configured HTML with the configured content type,
    extra headers and status code."""
    request.setHeader(b'content-type', to_bytes(self.content_type))
    for header_name, header_value in self.extra_headers.items():
        request.setHeader(to_bytes(header_name), to_bytes(header_value))
    # status_code defaults to 200 on HtmlResource; subclasses override it.
    request.setResponseCode(self.status_code)
    return to_bytes(self.html)


Expand Down

0 comments on commit cde6b3f

Please sign in to comment.