66
77from scrapy .downloadermiddlewares .cookies import CookiesMiddleware
88from scrapy .downloadermiddlewares .defaultheaders import DefaultHeadersMiddleware
9+ from scrapy .downloadermiddlewares .redirect import RedirectMiddleware
910from scrapy .exceptions import NotConfigured
1011from scrapy .http import Response , Request
12+ from scrapy .settings import Settings
1113from scrapy .spiders import Spider
1214from scrapy .utils .python import to_bytes
1315from scrapy .utils .test import get_crawler
@@ -23,9 +25,11 @@ def split_cookies(cookies):
def setUp(self):
    """Create the spider and the two middlewares used by the tests."""
    self.spider = Spider('foo')
    self.mw = CookiesMiddleware()
    # The redirect middleware is built from a default Settings object.
    default_settings = Settings()
    self.redirect_middleware = RedirectMiddleware(settings=default_settings)
2629
def tearDown(self):
    """Drop the middleware instances created in ``setUp``."""
    del self.mw, self.redirect_middleware
2933
3034 def test_basic (self ):
3135 req = Request ('http://scrapytest.org/' )
@@ -368,3 +372,154 @@ def test_primitive_type_cookies(self):
368372 req4 = Request ('http://example.org' , cookies = {'a' : 'b' })
369373 assert self .mw .process_request (req4 , self .spider ) is None
370374 self .assertCookieValEqual (req4 .headers ['Cookie' ], b'a=b' )
375+
def _test_cookie_redirect(
    self,
    source,
    target,
    *,
    cookies1,
    cookies2,
):
    """Check the ``Cookie`` header before and after following a redirect.

    A request carrying the cookie ``a=b`` is built from *source*, passed
    through the cookies middleware, answered with a redirect response built
    from *target*, and turned into a follow-up request by the redirect
    middleware, which is then passed through the cookies middleware too.

    *source* and *target* may each be a URL or a dict of keyword arguments
    (for ``Request`` and ``Response`` respectively). When *target* has no
    ``'status'`` key, 301 is used.

    *cookies1* and *cookies2* tell whether the first and the follow-up
    request, respectively, are expected to end up with ``Cookie: a=b``
    (otherwise no ``Cookie`` header at all is expected).
    """
    # Normalise the shorthand (bare URL) form into keyword-argument dicts.
    source = source if isinstance(source, dict) else {'url': source}
    target = target if isinstance(target, dict) else {'url': target}
    target.setdefault('status', 301)

    # First request: cookies passed by the user through ``cookies=``.
    first_request = Request(cookies={'a': 'b'}, **source)
    self.mw.process_request(first_request, self.spider)
    first_cookie_header = first_request.headers.get('Cookie')
    self.assertEqual(first_cookie_header, b"a=b" if cookies1 else None)

    # Redirect response pointing at the target URL.
    redirect_headers = {'Location': target['url']}
    response = Response(headers=redirect_headers, **target)
    processed = self.mw.process_response(
        first_request,
        response,
        self.spider,
    )
    self.assertEqual(processed, response)

    # The redirect middleware must produce a follow-up request.
    second_request = self.redirect_middleware.process_response(
        first_request,
        response,
        self.spider,
    )
    self.assertIsInstance(second_request, Request)

    self.mw.process_request(second_request, self.spider)
    second_cookie_header = second_request.headers.get('Cookie')
    self.assertEqual(second_cookie_header, b"a=b" if cookies2 else None)
418+
def test_cookie_redirect_same_domain(self):
    """Redirect to the same URL: both requests are expected to carry the cookie."""
    url = 'https://toscrape.com'
    self._test_cookie_redirect(url, url, cookies1=True, cookies2=True)
426+
def test_cookie_redirect_same_domain_forcing_get(self):
    """Same URL, explicit 302 status: both requests are expected to carry the cookie."""
    redirect = {'url': 'https://toscrape.com', 'status': 302}
    self._test_cookie_redirect(
        'https://toscrape.com',
        redirect,
        cookies1=True,
        cookies2=True,
    )
434+
def test_cookie_redirect_different_domain(self):
    """Redirect to another domain: only the first request is expected to carry the cookie."""
    origin = 'https://toscrape.com'
    destination = 'https://example.com'
    self._test_cookie_redirect(
        origin,
        destination,
        cookies1=True,
        cookies2=False,
    )
442+
def test_cookie_redirect_different_domain_forcing_get(self):
    """Other domain, explicit 302 status: only the first request is expected to carry the cookie."""
    redirect = {'url': 'https://example.com', 'status': 302}
    self._test_cookie_redirect(
        'https://toscrape.com',
        redirect,
        cookies1=True,
        cookies2=False,
    )
450+
def _test_cookie_header_redirect(
    self,
    source,
    target,
    *,
    cookies2,
):
    """Check whether a user-defined Cookie header survives a redirect.

    Cookie handling follows RFC 6265. A Cookie header can only hold a list
    of key-value pairs, with no extra cookie parameters such as Domain or
    Path. For that reason it is treated under the same rules as a
    Set-Cookie response header that has no Domain set: the cookies must
    stay limited to the target URL domain, and not even subdomains may
    receive them.

    .. note:: This method covers the scenario where the cookie middleware
              is disabled. Because of known issue #1992, when the cookies
              middleware is enabled there is no need to worry about the
              Cookie header leaking to unintended domains, since the
              middleware empties the header from every request.

    *source* and *target* may each be a URL or a dict of keyword arguments
    (for ``Request`` and ``Response`` respectively). When *target* has no
    ``'status'`` key, 301 is used. *cookies2* tells whether the follow-up
    request is expected to keep ``Cookie: a=b`` (otherwise no ``Cookie``
    header at all is expected).
    """
    # Normalise the shorthand (bare URL) form into keyword-argument dicts.
    source = source if isinstance(source, dict) else {'url': source}
    target = target if isinstance(target, dict) else {'url': target}
    target.setdefault('status', 301)

    # The cookie is set directly as a header, not through ``cookies=``.
    original_request = Request(headers={'Cookie': b'a=b'}, **source)

    redirect_headers = {'Location': target['url']}
    response = Response(headers=redirect_headers, **target)

    followup_request = self.redirect_middleware.process_response(
        original_request,
        response,
        self.spider,
    )
    self.assertIsInstance(followup_request, Request)

    followup_cookie_header = followup_request.headers.get('Cookie')
    self.assertEqual(followup_cookie_header, b"a=b" if cookies2 else None)
498+
def test_cookie_header_redirect_same_domain(self):
    """Redirect to the same URL: the Cookie header is expected on the follow-up request."""
    url = 'https://toscrape.com'
    self._test_cookie_header_redirect(url, url, cookies2=True)
505+
def test_cookie_header_redirect_same_domain_forcing_get(self):
    """Same URL, explicit 302 status: the Cookie header is expected on the follow-up request."""
    redirect = {'url': 'https://toscrape.com', 'status': 302}
    self._test_cookie_header_redirect(
        'https://toscrape.com',
        redirect,
        cookies2=True,
    )
512+
def test_cookie_header_redirect_different_domain(self):
    """Redirect to another domain: no Cookie header is expected on the follow-up request."""
    origin = 'https://toscrape.com'
    destination = 'https://example.com'
    self._test_cookie_header_redirect(origin, destination, cookies2=False)
519+
def test_cookie_header_redirect_different_domain_forcing_get(self):
    """Other domain, explicit 302 status: no Cookie header is expected on the follow-up request."""
    redirect = {'url': 'https://example.com', 'status': 302}
    self._test_cookie_header_redirect(
        'https://toscrape.com',
        redirect,
        cookies2=False,
    )
0 commit comments