Skip to content

Commit b01d69a

Browse files
wRAR authored and
Gallaecio committed
Add http_auth_domain to HttpAuthMiddleware.
1 parent 4183925 commit b01d69a

File tree

3 files changed

+118
-6
lines changed

3 files changed

+118
-6
lines changed

Diff for: docs/topics/downloader-middleware.rst

+16-2
Original file line numberDiff line numberDiff line change
@@ -315,8 +315,21 @@ HttpAuthMiddleware
315315
This middleware authenticates all requests generated from certain spiders
316316
using `Basic access authentication`_ (aka. HTTP auth).
317317

318-
To enable HTTP authentication from certain spiders, set the ``http_user``
319-
and ``http_pass`` attributes of those spiders.
318+
To enable HTTP authentication for a spider, set the ``http_user`` and
319+
``http_pass`` spider attributes to the authentication data and the
320+
``http_auth_domain`` spider attribute to the domain which requires this
321+
authentication (its subdomains will also be handled in the same way).
322+
You can set ``http_auth_domain`` to ``None`` to enable the
323+
authentication for all requests, but this is usually not needed.
324+
325+
.. warning::
326+
In previous Scrapy versions, HttpAuthMiddleware sent the
327+
authentication data with all requests, which is a security problem if
328+
the spider makes requests to several different domains. Currently if
329+
the ``http_auth_domain`` attribute is not set, the middleware will use
330+
the domain of the first request, which will work for some spiders but
331+
not for others. In the future the middleware will produce an error
332+
instead.
320333

321334
Example::
322335

@@ -326,6 +339,7 @@ HttpAuthMiddleware
326339

327340
http_user = 'someuser'
328341
http_pass = 'somepass'
342+
http_auth_domain = 'intranet.example.com'
329343
name = 'intranet.example.com'
330344

331345
# .. rest of the spider code omitted ...

Diff for: scrapy/downloadermiddlewares/httpauth.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@
33
44
See documentation in docs/topics/downloader-middleware.rst
55
"""
6+
import warnings
67

78
from w3lib.http import basic_auth_header
89

910
from scrapy import signals
11+
from scrapy.exceptions import ScrapyDeprecationWarning
12+
from scrapy.utils.httpobj import urlparse_cached
13+
from scrapy.utils.url import url_is_from_any_domain
1014

1115

1216
class HttpAuthMiddleware(object):
@@ -24,8 +28,23 @@ def spider_opened(self, spider):
2428
pwd = getattr(spider, 'http_pass', '')
2529
if usr or pwd:
2630
self.auth = basic_auth_header(usr, pwd)
31+
if not hasattr(spider, 'http_auth_domain'):
32+
warnings.warn('Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security '
33+
'problems if the spider makes requests to several different domains. http_auth_domain '
34+
'will be set to the domain of the first request, please set it to the correct value '
35+
'explicitly.',
36+
category=ScrapyDeprecationWarning)
37+
self.domain_unset = True
38+
else:
39+
self.domain = spider.http_auth_domain
40+
self.domain_unset = False
2741

2842
def process_request(self, request, spider):
2943
auth = getattr(self, 'auth', None)
3044
if auth and b'Authorization' not in request.headers:
31-
request.headers[b'Authorization'] = auth
45+
domain = urlparse_cached(request).hostname
46+
if self.domain_unset:
47+
self.domain = domain
48+
self.domain_unset = False
49+
if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
50+
request.headers[b'Authorization'] = auth

Diff for: tests/test_downloadermiddleware_httpauth.py

+82-3
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,60 @@
11
import unittest
22

3+
from w3lib.http import basic_auth_header
4+
35
from scrapy.http import Request
46
from scrapy.downloadermiddlewares.httpauth import HttpAuthMiddleware
57
from scrapy.spiders import Spider
68

79

10+
class TestSpiderLegacy(Spider):
11+
http_user = 'foo'
12+
http_pass = 'bar'
13+
14+
815
class TestSpider(Spider):
916
http_user = 'foo'
1017
http_pass = 'bar'
18+
http_auth_domain = 'example.com'
19+
20+
21+
class TestSpiderAny(Spider):
22+
http_user = 'foo'
23+
http_pass = 'bar'
24+
http_auth_domain = None
25+
26+
27+
class HttpAuthMiddlewareLegacyTest(unittest.TestCase):
28+
29+
def setUp(self):
30+
self.spider = TestSpiderLegacy('foo')
31+
32+
def test_auth(self):
33+
mw = HttpAuthMiddleware()
34+
mw.spider_opened(self.spider)
35+
36+
# initial request, sets the domain and sends the header
37+
req = Request('http://example.com/')
38+
assert mw.process_request(req, self.spider) is None
39+
self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
40+
41+
# subsequent request to the same domain, should send the header
42+
req = Request('http://example.com/')
43+
assert mw.process_request(req, self.spider) is None
44+
self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
45+
46+
# subsequent request to a different domain, shouldn't send the header
47+
req = Request('http://example-noauth.com/')
48+
assert mw.process_request(req, self.spider) is None
49+
self.assertNotIn('Authorization', req.headers)
50+
51+
def test_auth_already_set(self):
52+
mw = HttpAuthMiddleware()
53+
mw.spider_opened(self.spider)
54+
req = Request('http://example.com/',
55+
headers=dict(Authorization='Digest 123'))
56+
assert mw.process_request(req, self.spider) is None
57+
self.assertEqual(req.headers['Authorization'], b'Digest 123')
1158

1259

1360
class HttpAuthMiddlewareTest(unittest.TestCase):
@@ -20,13 +67,45 @@ def setUp(self):
2067
def tearDown(self):
2168
del self.mw
2269

70+
def test_no_auth(self):
71+
req = Request('http://example-noauth.com/')
72+
assert self.mw.process_request(req, self.spider) is None
73+
self.assertNotIn('Authorization', req.headers)
74+
75+
def test_auth_domain(self):
76+
req = Request('http://example.com/')
77+
assert self.mw.process_request(req, self.spider) is None
78+
self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
79+
80+
def test_auth_subdomain(self):
81+
req = Request('http://foo.example.com/')
82+
assert self.mw.process_request(req, self.spider) is None
83+
self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
84+
85+
def test_auth_already_set(self):
86+
req = Request('http://example.com/',
87+
headers=dict(Authorization='Digest 123'))
88+
assert self.mw.process_request(req, self.spider) is None
89+
self.assertEqual(req.headers['Authorization'], b'Digest 123')
90+
91+
92+
class HttpAuthAnyMiddlewareTest(unittest.TestCase):
93+
94+
def setUp(self):
95+
self.mw = HttpAuthMiddleware()
96+
self.spider = TestSpiderAny('foo')
97+
self.mw.spider_opened(self.spider)
98+
99+
def tearDown(self):
100+
del self.mw
101+
23102
def test_auth(self):
24-
req = Request('http://scrapytest.org/')
103+
req = Request('http://example.com/')
25104
assert self.mw.process_request(req, self.spider) is None
26-
self.assertEqual(req.headers['Authorization'], b'Basic Zm9vOmJhcg==')
105+
self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
27106

28107
def test_auth_already_set(self):
29-
req = Request('http://scrapytest.org/',
108+
req = Request('http://example.com/',
30109
headers=dict(Authorization='Digest 123'))
31110
assert self.mw.process_request(req, self.spider) is None
32111
self.assertEqual(req.headers['Authorization'], b'Digest 123')

0 commit comments

Comments
 (0)