
Commit 8ce01b3

Merge pull request from GHSA-cjvr-mfj7-j4j8
* Do not carry over cookies to a different domain on redirect
* Cover the cookie-domain redirect fix in the release notes
* Cover 1.8.2 in the release notes
* Fix redirect Cookie handling when the cookie middleware is disabled
* Update the 1.8.2 release date
1 parent aa0306a commit 8ce01b3

File tree

3 files changed: +247 −6 lines changed

* docs/news.rst
* scrapy/downloadermiddlewares/redirect.py
* tests/test_downloadermiddleware_cookies.py

Diff for: docs/news.rst

+66 −1

@@ -5,11 +5,13 @@ Release notes
 
 .. _release-2.6.0:
 
-Scrapy 2.6.0 (2022-02-??)
+Scrapy 2.6.0 (2022-03-01)
 -------------------------
 
 Highlights:
 
+* :ref:`Security fixes for cookie handling <2.6-security-fixes>`
+
 * Python 3.10 support
 
 * :ref:`asyncio support <using-asyncio>` is no longer considered
@@ -20,6 +22,37 @@ Highlights:
   :ref:`item filtering <item-filter>` and
   :ref:`post-processing <post-processing>`
 
+.. _2.6-security-fixes:
+
+Security bug fixes
+~~~~~~~~~~~~~~~~~~
+
+- When a :class:`~scrapy.http.Request` object with cookies defined gets a
+  redirect response causing a new :class:`~scrapy.http.Request` object to be
+  scheduled, the cookies defined in the original
+  :class:`~scrapy.http.Request` object are no longer copied into the new
+  :class:`~scrapy.http.Request` object.
+
+  If you manually set the ``Cookie`` header on a
+  :class:`~scrapy.http.Request` object and the domain name of the redirect
+  URL is not an exact match for the domain of the URL of the original
+  :class:`~scrapy.http.Request` object, your ``Cookie`` header is now dropped
+  from the new :class:`~scrapy.http.Request` object.
+
+  The old behavior could be exploited by an attacker to gain access to your
+  cookies. Please, see the `cjvr-mfj7-j4j8 security advisory`_ for more
+  information.
+
+  .. _cjvr-mfj7-j4j8 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cjvr-mfj7-j4j8
+
+  .. note:: It is still possible to enable the sharing of cookies between
+            different domains with a shared domain suffix (e.g.
+            ``example.com`` and any subdomain) by defining the shared domain
+            suffix (e.g. ``example.com``) as the cookie domain when defining
+            your cookies. See the documentation of the
+            :class:`~scrapy.http.Request` class for more information.
+
+
 Modified requirements
 ~~~~~~~~~~~~~~~~~~~~~
 
@@ -1842,6 +1875,38 @@ affect subclasses:
 
   (:issue:`3884`)
 
+.. _release-1.8.2:
+
+Scrapy 1.8.2 (2022-03-01)
+-------------------------
+
+**Security bug fixes:**
+
+- When a :class:`~scrapy.http.Request` object with cookies defined gets a
+  redirect response causing a new :class:`~scrapy.http.Request` object to be
+  scheduled, the cookies defined in the original
+  :class:`~scrapy.http.Request` object are no longer copied into the new
+  :class:`~scrapy.http.Request` object.
+
+  If you manually set the ``Cookie`` header on a
+  :class:`~scrapy.http.Request` object and the domain name of the redirect
+  URL is not an exact match for the domain of the URL of the original
+  :class:`~scrapy.http.Request` object, your ``Cookie`` header is now dropped
+  from the new :class:`~scrapy.http.Request` object.
+
+  The old behavior could be exploited by an attacker to gain access to your
+  cookies. Please, see the `cjvr-mfj7-j4j8 security advisory`_ for more
+  information.
+
+  .. _cjvr-mfj7-j4j8 security advisory: https://github.com/scrapy/scrapy/security/advisories/GHSA-cjvr-mfj7-j4j8
+
+  .. note:: It is still possible to enable the sharing of cookies between
+            different domains with a shared domain suffix (e.g.
+            ``example.com`` and any subdomain) by defining the shared domain
+            suffix (e.g. ``example.com``) as the cookie domain when defining
+            your cookies. See the documentation of the
+            :class:`~scrapy.http.Request` class for more information.
+
 
 .. _release-1.8.1:
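The note above mentions that sharing cookies across a domain and its subdomains is still possible by declaring the shared suffix as the cookie domain. A minimal illustrative sketch (not part of this commit; the URL and cookie values are placeholders), using the list-of-dicts form of the Request "cookies" argument, which accepts a "domain" key:

    from scrapy import Request

    request = Request(
        'https://toscrape.com/login',
        cookies=[
            {
                'name': 'currency',
                'value': 'USD',
                # Declaring the shared suffix as the cookie domain keeps the
                # cookie valid for toscrape.com and any of its subdomains.
                'domain': 'toscrape.com',
            },
        ],
    )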

Diff for: scrapy/downloadermiddlewares/redirect.py

+26 −5

@@ -4,13 +4,29 @@
 from w3lib.url import safe_url_string
 
 from scrapy.http import HtmlResponse
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.response import get_meta_refresh
 from scrapy.exceptions import IgnoreRequest, NotConfigured
 
 
 logger = logging.getLogger(__name__)
 
 
+def _build_redirect_request(source_request, *, url, method=None, body=None):
+    redirect_request = source_request.replace(
+        url=url,
+        method=method,
+        body=body,
+        cookies=None,
+    )
+    if 'Cookie' in redirect_request.headers:
+        source_request_netloc = urlparse_cached(source_request).netloc
+        redirect_request_netloc = urlparse_cached(redirect_request).netloc
+        if source_request_netloc != redirect_request_netloc:
+            del redirect_request.headers['Cookie']
+    return redirect_request
+
+
 class BaseRedirectMiddleware:
 
     enabled_setting = 'REDIRECT_ENABLED'
@@ -47,10 +63,15 @@ def _redirect(self, redirected, request, spider, reason):
         raise IgnoreRequest("max redirections reached")
 
     def _redirect_request_using_get(self, request, redirect_url):
-        redirected = request.replace(url=redirect_url, method='GET', body='')
-        redirected.headers.pop('Content-Type', None)
-        redirected.headers.pop('Content-Length', None)
-        return redirected
+        redirect_request = _build_redirect_request(
+            request,
+            url=redirect_url,
+            method='GET',
+            body='',
+        )
+        redirect_request.headers.pop('Content-Type', None)
+        redirect_request.headers.pop('Content-Length', None)
+        return redirect_request
 
 
 class RedirectMiddleware(BaseRedirectMiddleware):
@@ -80,7 +101,7 @@ def process_response(self, request, response, spider):
         redirected_url = urljoin(request.url, location)
 
         if response.status in (301, 307, 308) or request.method == 'HEAD':
-            redirected = request.replace(url=redirected_url)
+            redirected = _build_redirect_request(request, url=redirected_url)
             return self._redirect(redirected, request, spider, response.status)
 
         redirected = self._redirect_request_using_get(request, redirected_url)
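A minimal sketch (not part of the commit; URLs and the cookie value are placeholders) of the behavior the new _build_redirect_request helper gives RedirectMiddleware: a manually set Cookie header survives a same-domain redirect but is dropped when the redirect target has a different netloc. The expected outputs in the comments mirror the tests added below.

    from scrapy import Request, Spider
    from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
    from scrapy.http import Response
    from scrapy.settings import Settings

    spider = Spider('example')
    middleware = RedirectMiddleware(settings=Settings())

    request = Request('https://toscrape.com/', headers={'Cookie': 'a=b'})
    response = Response(
        'https://toscrape.com/',
        status=301,
        headers={'Location': 'https://example.com/'},
    )

    redirected = middleware.process_response(request, response, spider)
    print(redirected.url)                    # https://example.com/
    print(redirected.headers.get('Cookie'))  # None: the header is dropped because
                                             # the redirect crosses to another netloc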

Diff for: tests/test_downloadermiddleware_cookies.py

+155 −0

@@ -6,8 +6,10 @@
 
 from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
 from scrapy.downloadermiddlewares.defaultheaders import DefaultHeadersMiddleware
+from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
 from scrapy.exceptions import NotConfigured
 from scrapy.http import Response, Request
+from scrapy.settings import Settings
 from scrapy.spiders import Spider
 from scrapy.utils.python import to_bytes
 from scrapy.utils.test import get_crawler
@@ -23,9 +25,11 @@ def split_cookies(cookies):
     def setUp(self):
         self.spider = Spider('foo')
         self.mw = CookiesMiddleware()
+        self.redirect_middleware = RedirectMiddleware(settings=Settings())
 
     def tearDown(self):
         del self.mw
+        del self.redirect_middleware
 
     def test_basic(self):
         req = Request('http://scrapytest.org/')
@@ -368,3 +372,154 @@ def test_primitive_type_cookies(self):
         req4 = Request('http://example.org', cookies={'a': 'b'})
         assert self.mw.process_request(req4, self.spider) is None
         self.assertCookieValEqual(req4.headers['Cookie'], b'a=b')
+
+    def _test_cookie_redirect(
+        self,
+        source,
+        target,
+        *,
+        cookies1,
+        cookies2,
+    ):
+        input_cookies = {'a': 'b'}
+
+        if not isinstance(source, dict):
+            source = {'url': source}
+        if not isinstance(target, dict):
+            target = {'url': target}
+        target.setdefault('status', 301)
+
+        request1 = Request(cookies=input_cookies, **source)
+        self.mw.process_request(request1, self.spider)
+        cookies = request1.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies1 else None)
+
+        response = Response(
+            headers={
+                'Location': target['url'],
+            },
+            **target,
+        )
+        self.assertEqual(
+            self.mw.process_response(request1, response, self.spider),
+            response,
+        )
+
+        request2 = self.redirect_middleware.process_response(
+            request1,
+            response,
+            self.spider,
+        )
+        self.assertIsInstance(request2, Request)
+
+        self.mw.process_request(request2, self.spider)
+        cookies = request2.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies2 else None)
+
+    def test_cookie_redirect_same_domain(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            'https://toscrape.com',
+            cookies1=True,
+            cookies2=True,
+        )
+
+    def test_cookie_redirect_same_domain_forcing_get(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            {'url': 'https://toscrape.com', 'status': 302},
+            cookies1=True,
+            cookies2=True,
+        )
+
+    def test_cookie_redirect_different_domain(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            'https://example.com',
+            cookies1=True,
+            cookies2=False,
+        )
+
+    def test_cookie_redirect_different_domain_forcing_get(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            {'url': 'https://example.com', 'status': 302},
+            cookies1=True,
+            cookies2=False,
+        )
+
+    def _test_cookie_header_redirect(
+        self,
+        source,
+        target,
+        *,
+        cookies2,
+    ):
+        """Test the handling of a user-defined Cookie header when building a
+        redirect follow-up request.
+
+        We follow RFC 6265 for cookie handling. The Cookie header can only
+        contain a list of key-value pairs (i.e. no additional cookie
+        parameters like Domain or Path). Because of that, we follow the same
+        rules that we would follow for the handling of the Set-Cookie response
+        header when the Domain is not set: the cookies must be limited to the
+        target URL domain (not even subdomains can receive those cookies).
+
+        .. note:: This method tests the scenario where the cookie middleware is
+                  disabled. Because of known issue #1992, when the cookies
+                  middleware is enabled we do not need to be concerned about
+                  the Cookie header getting leaked to unintended domains,
+                  because the middleware empties the header from every request.
+        """
+        if not isinstance(source, dict):
+            source = {'url': source}
+        if not isinstance(target, dict):
+            target = {'url': target}
+        target.setdefault('status', 301)
+
+        request1 = Request(headers={'Cookie': b'a=b'}, **source)
+
+        response = Response(
+            headers={
+                'Location': target['url'],
+            },
+            **target,
+        )
+
+        request2 = self.redirect_middleware.process_response(
+            request1,
+            response,
+            self.spider,
+        )
+        self.assertIsInstance(request2, Request)
+
+        cookies = request2.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies2 else None)
+
+    def test_cookie_header_redirect_same_domain(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            'https://toscrape.com',
+            cookies2=True,
+        )
+
+    def test_cookie_header_redirect_same_domain_forcing_get(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            {'url': 'https://toscrape.com', 'status': 302},
+            cookies2=True,
+        )
+
+    def test_cookie_header_redirect_different_domain(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            'https://example.com',
+            cookies2=False,
+        )
+
+    def test_cookie_header_redirect_different_domain_forcing_get(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            {'url': 'https://example.com', 'status': 302},
+            cookies2=False,
+        )
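The docstring of _test_cookie_header_redirect notes that, with the cookie middleware enabled, a manually set Cookie header is emptied on every request (known issue #1992), which is why those tests exercise the redirect middleware on its own. A minimal sketch of that behavior (not part of the commit; the URL and cookie value are placeholders):

    from scrapy import Request, Spider
    from scrapy.downloadermiddlewares.cookies import CookiesMiddleware

    spider = Spider('example')
    cookies_middleware = CookiesMiddleware()

    request = Request('https://toscrape.com/', headers={'Cookie': 'a=b'})
    cookies_middleware.process_request(request, spider)
    print(request.headers.get('Cookie'))  # None: the middleware rebuilds the header
                                          # from its cookiejar, which holds no cookie
                                          # for this URL, so the manual value is lost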

0 commit comments
