
Commit

Merge pull request #4803 from elacuesta/instantiate-resolution-receiver
CachingHostnameResolver
wRAR committed Oct 20, 2020
2 parents c340e72 + 585e4a8 commit 75f35f5
Showing 6 changed files with 116 additions and 47 deletions.
67 changes: 43 additions & 24 deletions scrapy/resolver.py
@@ -1,6 +1,6 @@
 from twisted.internet import defer
 from twisted.internet.base import ThreadedResolver
-from twisted.internet.interfaces import IHostnameResolver, IResolutionReceiver, IResolverSimple
+from twisted.internet.interfaces import IHostResolution, IHostnameResolver, IResolutionReceiver, IResolverSimple
 from zope.interface.declarations import implementer, provider
 
 from scrapy.utils.datatypes import LocalCache
@@ -50,6 +50,36 @@ def _cache_result(self, result, name):
         return result
 
 
+@implementer(IHostResolution)
+class HostResolution:
+    def __init__(self, name):
+        self.name = name
+
+    def cancel(self):
+        raise NotImplementedError()
+
+
+@provider(IResolutionReceiver)
+class _CachingResolutionReceiver:
+    def __init__(self, resolutionReceiver, hostName):
+        self.resolutionReceiver = resolutionReceiver
+        self.hostName = hostName
+        self.addresses = []
+
+    def resolutionBegan(self, resolution):
+        self.resolutionReceiver.resolutionBegan(resolution)
+        self.resolution = resolution
+
+    def addressResolved(self, address):
+        self.resolutionReceiver.addressResolved(address)
+        self.addresses.append(address)
+
+    def resolutionComplete(self):
+        self.resolutionReceiver.resolutionComplete()
+        if self.addresses:
+            dnscache[self.hostName] = self.addresses
+
+
 @implementer(IHostnameResolver)
 class CachingHostnameResolver:
     """
@@ -73,33 +103,22 @@ def from_crawler(cls, crawler, reactor):
     def install_on_reactor(self):
         self.reactor.installNameResolver(self)
 
-    def resolveHostName(self, resolutionReceiver, hostName, portNumber=0,
-                        addressTypes=None, transportSemantics='TCP'):
-
-        @provider(IResolutionReceiver)
-        class CachingResolutionReceiver(resolutionReceiver):
-
-            def resolutionBegan(self, resolution):
-                super().resolutionBegan(resolution)
-                self.resolution = resolution
-                self.resolved = False
-
-            def addressResolved(self, address):
-                super().addressResolved(address)
-                self.resolved = True
-
-            def resolutionComplete(self):
-                super().resolutionComplete()
-                if self.resolved:
-                    dnscache[hostName] = self.resolution
-
+    def resolveHostName(
+        self, resolutionReceiver, hostName, portNumber=0, addressTypes=None, transportSemantics="TCP"
+    ):
         try:
-            return dnscache[hostName]
+            addresses = dnscache[hostName]
         except KeyError:
             return self.original_resolver.resolveHostName(
-                CachingResolutionReceiver(),
+                _CachingResolutionReceiver(resolutionReceiver, hostName),
                 hostName,
                 portNumber,
                 addressTypes,
-                transportSemantics
+                transportSemantics,
             )
+        else:
+            resolutionReceiver.resolutionBegan(HostResolution(hostName))
+            for addr in addresses:
+                resolutionReceiver.addressResolved(addr)
+            resolutionReceiver.resolutionComplete()
+            return resolutionReceiver
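
Two fixes are visible in this hunk: the old code tried to subclass the resolutionReceiver argument (which is an instance, not a class, at call time), cached the IHostResolution handle rather than the resolved addresses, and its cache-hit path returned the cached value without ever invoking the receiver callbacks. The new module-level _CachingResolutionReceiver wraps the receiver instance and caches the address list, and on a cache hit the else branch replays Twisted's IResolutionReceiver call sequence against the caller's receiver: resolutionBegan() once, addressResolved() per cached address, then resolutionComplete(). A minimal, self-contained sketch of that replay (hypothetical PrintingReceiver, not part of the commit):

# Illustrative sketch only: replaying the IResolutionReceiver call
# sequence the way the cache-hit branch above does.

class HostResolution:
    # Token identifying an in-progress resolution, as in the diff above.
    def __init__(self, name):
        self.name = name


class PrintingReceiver:
    # Hypothetical stand-in for a caller-supplied IResolutionReceiver.
    def resolutionBegan(self, resolution):
        print("resolution began for", resolution.name)

    def addressResolved(self, address):
        print("address resolved:", address)

    def resolutionComplete(self):
        print("resolution complete")


receiver = PrintingReceiver()
cached = ["192.0.2.1", "192.0.2.2"]  # stand-in for dnscache[hostName]

# Shape of the `else` branch in resolveHostName above:
receiver.resolutionBegan(HostResolution("example.org"))
for addr in cached:
    receiver.addressResolved(addr)
receiver.resolutionComplete()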
15 changes: 0 additions & 15 deletions tests/CrawlerProcess/alternative_name_resolver.py

This file was deleted.

30 changes: 30 additions & 0 deletions tests/CrawlerProcess/caching_hostname_resolver.py
@@ -0,0 +1,30 @@
+import sys
+
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+
+class CachingHostnameResolverSpider(scrapy.Spider):
+    """
+    Finishes in a finite amount of time (does not hang indefinitely in the DNS resolution)
+    """
+    name = "caching_hostname_resolver_spider"
+
+    def start_requests(self):
+        yield scrapy.Request(self.url)
+
+    def parse(self, response):
+        for _ in range(10):
+            yield scrapy.Request(response.url, dont_filter=True, callback=self.ignore_response)
+
+    def ignore_response(self, response):
+        self.logger.info(repr(response.ip_address))
+
+
+if __name__ == "__main__":
+    process = CrawlerProcess(settings={
+        "RETRY_ENABLED": False,
+        "DNS_RESOLVER": "scrapy.resolver.CachingHostnameResolver",
+    })
+    process.crawl(CachingHostnameResolverSpider, url=sys.argv[1])
+    process.start()
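
The script above takes the target URL as its first command-line argument and forwards it to the spider as its url attribute via process.crawl. A hypothetical invocation against a local server (the test added below passes a MockServer address instead; the host and port here are placeholders):

python tests/CrawlerProcess/caching_hostname_resolver.py http://127.0.0.1:8998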
19 changes: 19 additions & 0 deletions tests/CrawlerProcess/caching_hostname_resolver_ipv6.py
@@ -0,0 +1,19 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+
+class CachingHostnameResolverSpider(scrapy.Spider):
+    """
+    Finishes without a twisted.internet.error.DNSLookupError exception
+    """
+    name = "caching_hostname_resolver_spider"
+    start_urls = ["http://[::1]"]
+
+
+if __name__ == "__main__":
+    process = CrawlerProcess(settings={
+        "RETRY_ENABLED": False,
+        "DNS_RESOLVER": "scrapy.resolver.CachingHostnameResolver",
+    })
+    process.crawl(CachingHostnameResolverSpider)
+    process.start()
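
One detail worth noting in the script above: the brackets in "http://[::1]" are URL syntax for an IPv6 literal, so the resolver is handed the bare address ::1. A quick standard-library check (illustrative only, not part of the commit):

from urllib.parse import urlparse

# The brackets are stripped during URL parsing; "::1" is what gets resolved.
print(urlparse("http://[::1]").hostname)  # -> ::1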
11 changes: 8 additions & 3 deletions tests/CrawlerProcess/default_name_resolver.py
@@ -3,10 +3,15 @@
 
 
 class IPv6Spider(scrapy.Spider):
+    """
+    Raises a twisted.internet.error.DNSLookupError:
+    the default name resolver does not handle IPv6 addresses.
+    """
     name = "ipv6_spider"
     start_urls = ["http://[::1]"]
 
 
-process = CrawlerProcess(settings={"RETRY_ENABLED": False})
-process.crawl(IPv6Spider)
-process.start()
+if __name__ == "__main__":
+    process = CrawlerProcess(settings={"RETRY_ENABLED": False})
+    process.crawl(IPv6Spider)
+    process.start()
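
For context on the docstring above: Scrapy's default resolver builds on Twisted's ThreadedResolver (see the import in scrapy/resolver.py above), which resolves names via the IPv4-only socket.gethostbyname. The contrast with getaddrinfo, which getaddrinfo-based IHostnameResolver implementations such as Twisted's GAIResolver rely on, is easy to reproduce with the standard library (illustrative sketch, not part of the commit):

import socket

# gethostbyname() only supports IPv4, so an IPv6 literal fails outright:
try:
    socket.gethostbyname("::1")
except OSError as exc:  # raised as socket.gaierror on most platforms
    print("gethostbyname failed:", exc)

# getaddrinfo() handles IPv6:
for family, _type, _proto, _canonname, sockaddr in socket.getaddrinfo(
    "::1", 80, socket.AF_INET6, socket.SOCK_STREAM
):
    print(family, sockaddr)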
21 changes: 16 additions & 5 deletions tests/test_crawler.py
@@ -22,6 +22,8 @@
 from scrapy.extensions import telnet
 from scrapy.utils.test import get_testenv
 
+from tests.mockserver import MockServer
+
 
 class BaseCrawlerTest(unittest.TestCase):
 
@@ -280,9 +282,9 @@ def test_crawler_process_asyncio_enabled_false(self):
 
 
 class ScriptRunnerMixin:
-    def run_script(self, script_name):
+    def run_script(self, script_name, *script_args):
         script_path = os.path.join(self.script_dir, script_name)
-        args = (sys.executable, script_path)
+        args = [sys.executable, script_path] + list(script_args)
         p = subprocess.Popen(args, env=get_testenv(),
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         stdout, stderr = p.communicate()
@@ -321,11 +323,20 @@ def test_ipv6_default_name_resolver(self):
             "twisted.internet.error.DNSLookupError: DNS lookup failed: no results for hostname lookup: ::1.",
             log)
 
-    def test_ipv6_alternative_name_resolver(self):
-        log = self.run_script('alternative_name_resolver.py')
-        self.assertIn('Spider closed (finished)', log)
+    def test_caching_hostname_resolver_ipv6(self):
+        log = self.run_script("caching_hostname_resolver_ipv6.py")
+        self.assertIn("Spider closed (finished)", log)
+        self.assertNotIn("twisted.internet.error.DNSLookupError", log)
 
+    def test_caching_hostname_resolver_finite_execution(self):
+        with MockServer() as mock_server:
+            http_address = mock_server.http_address.replace("0.0.0.0", "127.0.0.1")
+            log = self.run_script("caching_hostname_resolver.py", http_address)
+            self.assertIn("Spider closed (finished)", log)
+            self.assertNotIn("ERROR: Error downloading", log)
+            self.assertNotIn("TimeoutError", log)
+            self.assertNotIn("twisted.internet.error.DNSLookupError", log)
+
     def test_reactor_select(self):
         log = self.run_script("twisted_reactor_select.py")
         self.assertIn("Spider closed (finished)", log)
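
Assuming a Scrapy checkout with the test dependencies installed, the two new cases can be selected with pytest's -k filter:

pytest tests/test_crawler.py -k caching_hostname_resolver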
