Skip to content
Permalink
Browse files Browse the repository at this point in the history
Add http_auth_domain to HttpAuthMiddleware.
  • Loading branch information
wRAR authored and Gallaecio committed Oct 5, 2021
1 parent 4183925 commit b01d69a
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 6 deletions.
18 changes: 16 additions & 2 deletions docs/topics/downloader-middleware.rst
Expand Up @@ -315,8 +315,21 @@ HttpAuthMiddleware
This middleware authenticates all requests generated from certain spiders
using `Basic access authentication`_ (aka. HTTP auth).

To enable HTTP authentication from certain spiders, set the ``http_user``
and ``http_pass`` attributes of those spiders.
To enable HTTP authentication for a spider, set the ``http_user`` and
``http_pass`` spider attributes to the authentication data, and the
``http_auth_domain`` spider attribute to the domain that requires this
authentication (its subdomains are handled in the same way).
You can set ``http_auth_domain`` to ``None`` to enable
authentication for all requests, but this is usually not needed.

.. warning::
In previous Scrapy versions, HttpAuthMiddleware sent the
authentication data with all requests, which is a security problem if
the spider makes requests to several different domains. Currently, if
the ``http_auth_domain`` attribute is not set, the middleware will use
the domain of the first request, which will work for some spiders but
not for others. In the future, the middleware will produce an error
instead.

Example::

Expand All @@ -326,6 +339,7 @@ HttpAuthMiddleware

http_user = 'someuser'
http_pass = 'somepass'
http_auth_domain = 'intranet.example.com'
name = 'intranet.example.com'

# .. rest of the spider code omitted ...
Expand Down
21 changes: 20 additions & 1 deletion scrapy/downloadermiddlewares/httpauth.py
Expand Up @@ -3,10 +3,14 @@
See documentation in docs/topics/downloader-middleware.rst
"""
import warnings

from w3lib.http import basic_auth_header

from scrapy import signals
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.url import url_is_from_any_domain


class HttpAuthMiddleware(object):
Expand All @@ -24,8 +28,23 @@ def spider_opened(self, spider):
pwd = getattr(spider, 'http_pass', '')
if usr or pwd:
self.auth = basic_auth_header(usr, pwd)
if not hasattr(spider, 'http_auth_domain'):
warnings.warn('Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security '
'problems if the spider makes requests to several different domains. http_auth_domain '
'will be set to the domain of the first request, please set it to the correct value '
'explicitly.',
category=ScrapyDeprecationWarning)
self.domain_unset = True
else:
self.domain = spider.http_auth_domain
self.domain_unset = False

def process_request(self, request, spider):
    """Add the stored Basic auth header to requests aimed at the auth domain.

    The header is only set when credentials were configured
    (``self.auth`` exists and is non-empty) and the request does not
    already carry an ``Authorization`` header. The request must also match
    ``self.domain`` (or one of its subdomains), unless ``self.domain`` is
    falsy, in which case every request is authenticated.

    :param request: the outgoing request; its headers may be mutated
    :param spider: the spider issuing the request (unused here)
    :returns: always ``None``
    """
    auth = getattr(self, 'auth', None)
    if not auth or b'Authorization' in request.headers:
        # Nothing to send, or the caller supplied its own credentials.
        return None

    if self.domain_unset:
        # Legacy mode (no http_auth_domain on the spider): lock onto the
        # host of the first request that reaches this point.
        # NOTE(review): a first request without a hostname would store
        # None here, which the check below treats as "any domain" - confirm
        # whether that case needs guarding.
        self.domain = urlparse_cached(request).hostname
        self.domain_unset = False

    # The header must only be written after the domain check; setting it
    # unconditionally would leak the credentials to every domain.
    if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
        request.headers[b'Authorization'] = auth
    return None
85 changes: 82 additions & 3 deletions tests/test_downloadermiddleware_httpauth.py
@@ -1,13 +1,60 @@
import unittest

from w3lib.http import basic_auth_header

from scrapy.http import Request
from scrapy.downloadermiddlewares.httpauth import HttpAuthMiddleware
from scrapy.spiders import Spider


class TestSpiderLegacy(Spider):
    """Spider fixture with credentials but no ``http_auth_domain`` attribute."""

    http_user, http_pass = 'foo', 'bar'


class TestSpider(Spider):
    """Spider fixture with credentials and ``http_auth_domain`` set to example.com."""

    http_auth_domain = 'example.com'
    http_user, http_pass = 'foo', 'bar'


class TestSpiderAny(Spider):
    """Spider fixture with credentials and ``http_auth_domain`` explicitly ``None``."""

    http_auth_domain = None
    http_user, http_pass = 'foo', 'bar'


class HttpAuthMiddlewareLegacyTest(unittest.TestCase):
    """Middleware checks for a spider that does not define ``http_auth_domain``."""

    def setUp(self):
        self.spider = TestSpiderLegacy('foo')

    def test_auth(self):
        """The first request's domain gets the header; another domain does not."""
        mw = HttpAuthMiddleware()
        mw.spider_opened(self.spider)

        # initial request, sets the domain and sends the header
        req = Request('http://example.com/')
        # assertIsNone (not a bare ``assert``) so the call still runs
        # when Python is started with -O.
        self.assertIsNone(mw.process_request(req, self.spider))
        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))

        # subsequent request to the same domain, should send the header
        req = Request('http://example.com/')
        self.assertIsNone(mw.process_request(req, self.spider))
        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))

        # subsequent request to a different domain, shouldn't send the header
        req = Request('http://example-noauth.com/')
        self.assertIsNone(mw.process_request(req, self.spider))
        self.assertNotIn('Authorization', req.headers)

    def test_auth_already_set(self):
        """An Authorization header supplied by the request is left untouched."""
        mw = HttpAuthMiddleware()
        mw.spider_opened(self.spider)
        req = Request('http://example.com/',
                      headers=dict(Authorization='Digest 123'))
        self.assertIsNone(mw.process_request(req, self.spider))
        self.assertEqual(req.headers['Authorization'], b'Digest 123')


class HttpAuthMiddlewareTest(unittest.TestCase):
Expand All @@ -20,13 +67,45 @@ def setUp(self):
def tearDown(self):
    """Remove the ``mw`` attribute from the test case instance."""
    delattr(self, 'mw')

def test_no_auth(self):
    """A request to example-noauth.com must not receive an Authorization header."""
    req = Request('http://example-noauth.com/')
    # assertIsNone (not a bare ``assert``) so the call still runs under -O.
    self.assertIsNone(self.mw.process_request(req, self.spider))
    self.assertNotIn('Authorization', req.headers)

def test_auth_domain(self):
    """A request to example.com receives the Basic auth header for foo/bar."""
    req = Request('http://example.com/')
    # assertIsNone (not a bare ``assert``) so the call still runs under -O.
    self.assertIsNone(self.mw.process_request(req, self.spider))
    self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))

def test_auth_subdomain(self):
    """A request to foo.example.com receives the Basic auth header for foo/bar."""
    req = Request('http://foo.example.com/')
    # assertIsNone (not a bare ``assert``) so the call still runs under -O.
    self.assertIsNone(self.mw.process_request(req, self.spider))
    self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))

def test_auth_already_set(self):
    """An Authorization header already present on the request is preserved."""
    req = Request('http://example.com/',
                  headers=dict(Authorization='Digest 123'))
    # assertIsNone (not a bare ``assert``) so the call still runs under -O.
    self.assertIsNone(self.mw.process_request(req, self.spider))
    self.assertEqual(req.headers['Authorization'], b'Digest 123')


class HttpAuthAnyMiddlewareTest(unittest.TestCase):
    """Middleware checks for a spider whose ``http_auth_domain`` is ``None``."""

    def setUp(self):
        self.mw = HttpAuthMiddleware()
        self.spider = TestSpiderAny('foo')
        self.mw.spider_opened(self.spider)

    def tearDown(self):
        del self.mw

    def test_auth(self):
        """The Basic auth header for foo/bar is added to the request."""
        req = Request('http://example.com/')
        # assertIsNone (not a bare ``assert``) so the call still runs
        # when Python is started with -O.
        self.assertIsNone(self.mw.process_request(req, self.spider))
        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))

    def test_auth_already_set(self):
        """An Authorization header already present on the request is preserved."""
        req = Request('http://example.com/',
                      headers=dict(Authorization='Digest 123'))
        self.assertIsNone(self.mw.process_request(req, self.spider))
        self.assertEqual(req.headers['Authorization'], b'Digest 123')

0 comments on commit b01d69a

Please sign in to comment.