Implement experimental HTTP/2 support #4769

Merged · 101 commits · Mar 18, 2021
Commits
9408c77
feat(http2): IH2EventsHandler, http2 module
adityaa30 May 31, 2020
7912923
chore(http2): Stream class
adityaa30 Jun 2, 2020
9ff9cae
feat(http2): support for GET requests
adityaa30 Jun 7, 2020
d09ccf8
feat(http2): support for POST requests
adityaa30 Jun 13, 2020
d06bb12
refactor: move H2Connection instance to stream
adityaa30 Jun 13, 2020
de4a343
fix: large data chunk not received
adityaa30 Jun 14, 2020
01ad8b3
refactor(http2): clean up
adityaa30 Jun 14, 2020
089dbc7
chore: use deque for pending request pool
adityaa30 Jun 17, 2020
700df3e
test: mockserver with h2 protocol for tests
adityaa30 Jun 17, 2020
303485a
fix(http2): POST request not sending large body
adityaa30 Jun 20, 2020
c74ef66
feat: handle response for different reasons
adityaa30 Jun 21, 2020
a97ac0a
test: GET request for HTTP2Client using mockserver
adityaa30 Jun 24, 2020
69f6d03
feat: TypedDict for Stream._response
adityaa30 Jun 24, 2020
065b315
Merge branch 'master' of https://github.com/scrapy/scrapy into h2-cli…
adityaa30 Jun 24, 2020
690dd7f
test: GET & POST request test for h2 client
adityaa30 Jun 28, 2020
6387445
test(tox.ini): change Twisted -> Twisted[http2]
adityaa30 Jun 28, 2020
23906b6
refactor: move TypedDict types to types.py
adityaa30 Jun 29, 2020
90a7007
test: warnsize logs, no content header, dataloss
adityaa30 Jun 29, 2020
d17417b
Merge branch 'master' of https://github.com/scrapy/scrapy into h2-cli…
adityaa30 Jun 29, 2020
26ab3e4
feat: FIFO policy to handle large no. of requests
adityaa30 Jun 30, 2020
50dd927
fix: disable redundant logs
adityaa30 Jun 30, 2020
7b1ad99
test: query params, certificate & ip_address
adityaa30 Jul 1, 2020
c361fe0
feat: check for invalid hostname
adityaa30 Jul 1, 2020
4acdc2e
refactor: use __qualname__, () for large strings
adityaa30 Jul 1, 2020
a94b303
test: reduce test data size to 1MB
adityaa30 Jul 6, 2020
7f5bb6b
chore: add h2 to setup.py, tox.ini
adityaa30 Jul 6, 2020
54e4228
refactor: use protocol
elacuesta Jul 6, 2020
1c40dfa
fix: handle CONNECTION_LOST & RESET separately
adityaa30 Jul 6, 2020
2ea7d82
feat: H2ClientFactory
adityaa30 Jul 8, 2020
64c6af1
refactor: use str instead of to_unicode
adityaa30 Jul 12, 2020
aeaeb73
feat: assert negotiated protocol as h2
adityaa30 Jul 13, 2020
1dd27a9
feat: Idle Timeout for H2Connection (240s)
adityaa30 Jul 14, 2020
e662762
chore: Handle ConnectionTerminated event
adityaa30 Jul 14, 2020
316620b
chore: pass spider as argument for request method
adityaa30 Jul 22, 2020
3685e99
test: http2 connection timeout
adityaa30 Jul 22, 2020
9fffb80
feat: H2Agent, H2ConnectionPool base implementation
adityaa30 Jul 8, 2020
8252a6f
fix: H2Agent not able to connect via SSL
adityaa30 Jul 14, 2020
62ce842
fix: multiple h2 connections to same uri
adityaa30 Jul 15, 2020
031bfc9
feat(wip): ScrapyH2Agent, ScrapyProxyH2Agent
adityaa30 Jul 22, 2020
92bec38
feat: MethodNotAllowed405, Content-Length header
adityaa30 Jul 29, 2020
e834299
test: H2DownloadHandler
adityaa30 Jul 29, 2020
19f2b4b
refactor: AcceptableProtocolsContextFactory
adityaa30 Jul 29, 2020
a3fecaf
test: fix host-name H2DownloadHandler tests
adityaa30 Jul 30, 2020
d707f8b
docs: mention H2DownloadHandler in settings.rst
adityaa30 Jul 30, 2020
e0c3019
fix: ScrapyProxyH2Agent
adityaa30 Aug 9, 2020
c67d6de
fix: H2 docs, NotImplementedError for H2 Tunnel
adityaa30 Aug 10, 2020
90f85a2
Enable Travis CI
Gallaecio Aug 11, 2020
af73f14
refactor: move all http2 tests in separate files
adityaa30 Aug 16, 2020
d97cf97
Merge branch 'master' of https://github.com/scrapy/scrapy into h2-cli…
adityaa30 Aug 16, 2020
f9f008e
test: add typing-extensions
adityaa30 Aug 16, 2020
38d3617
fix: typing & pylint errors
adityaa30 Aug 16, 2020
75fe3d1
fix: increase timeout to 0.5 seconds
adityaa30 Aug 16, 2020
a87ab71
refactor(http2): metadata for Stream
adityaa30 Aug 17, 2020
a206ac5
tests: disable python 3.5 for travis and azure
adityaa30 Aug 18, 2020
e3233b7
refactor(h2-stream): alphabetical order of imports
adityaa30 Aug 18, 2020
30eb005
fix: InvalidNegotiatedProtocol __str__ method
adityaa30 Aug 19, 2020
2f00666
refactor: move agents & context-factory
adityaa30 Aug 19, 2020
26d344b
Merge branch 'http2' of https://github.com/scrapy/scrapy into h2-clie…
adityaa30 Aug 24, 2020
1432161
fix: bump min typing-extensions version to 3.7.4
adityaa30 Aug 24, 2020
450ba6b
fix(typo): stream -> streams, use isinstance
adityaa30 Aug 26, 2020
5e36f53
chore: remove typing-extensions dependency
adityaa30 Aug 27, 2020
a8aedbe
chore: rearrange imports
adityaa30 Aug 29, 2020
eff33a2
fix(h2): Mockserver test uses H2DownloadHandler
adityaa30 Aug 30, 2020
e6dcfd3
Merge pull request #4610 from adityaa30/h2-client-protocol
Gallaecio Aug 31, 2020
8a3ba34
Merge remote-tracking branch 'upstream/master' into http2
Gallaecio Aug 31, 2020
ddc26f3
Revert Travis CI changes
Gallaecio Sep 1, 2020
4d6359d
Mark HTTP/2 as experimental
Gallaecio Sep 11, 2020
6e8d20a
HTTP/2: add some type hints (#4785)
elacuesta Sep 16, 2020
269fe35
Merge branch 'master' into http2
Gallaecio Oct 6, 2020
bde96a5
Ignore server-initiated events
Gallaecio Nov 18, 2020
08f5ed7
Fix memory issue due to unexpectedly large server frames
Gallaecio Nov 18, 2020
d698b51
Merge branch 'master' into http2
elacuesta Dec 31, 2020
e494a3f
protocol attribute for h2 responses
elacuesta Dec 31, 2020
2ce8e0c
Document the (hard-coded) maximum HTTP/2 frame size accepted from ser…
Gallaecio Feb 3, 2021
d102456
setup.py: Twisted → Twisted[http2]
Gallaecio Feb 3, 2021
536e749
HTTP/2: remove verbose protocol-handling logging
Gallaecio Feb 3, 2021
c8d8b18
Merge remote-tracking branch 'upstream/master' into http2
Gallaecio Feb 3, 2021
1a7bde0
Document that HTTP/2 server pushes are ignored
Gallaecio Feb 3, 2021
4c80155
Document that the bytes_received signal is not yet implemented for HT…
Gallaecio Feb 3, 2021
2488003
Fix test_pinned_twisted_version
Gallaecio Feb 3, 2021
0e4b291
HTTP/2: fix canceling a request before a connection has been established
Gallaecio Feb 3, 2021
7b11b74
Use --use-deprecated=legacy-resolver
Gallaecio Feb 4, 2021
7afcd63
Remove unused import
Gallaecio Feb 5, 2021
8527b53
Revert "Use --use-deprecated=legacy-resolver"
Gallaecio Feb 5, 2021
45345ba
Use constraints.txt to limit pip resolver backtracking
Gallaecio Feb 8, 2021
bb72bba
tox: apply upper constraints to all non-pinned package installations
Gallaecio Feb 8, 2021
9ac5b1d
Adjust test constraints
Gallaecio Feb 8, 2021
15b501c
Do not force string interpolation while logging
Gallaecio Feb 10, 2021
ac82a4a
Merge remote-tracking branch 'upstream/master' into http2
elacuesta Feb 17, 2021
e80f37b
Test http2 agent for unsupported scheme
elacuesta Feb 17, 2021
4418f78
Simplify check for negotiated protocol
elacuesta Feb 17, 2021
6326178
http2: acceptable protocol update, tests (#4994)
elacuesta Feb 22, 2021
7605f19
HTTP/2: test 2 concurrent requests to the same domain
Gallaecio Feb 23, 2021
bd29f32
HTTP/2: do not make conn_lost_deferred optional
Gallaecio Feb 23, 2021
5ba31cd
HTTP/2 stream close reason handling: Use else + assert instead of elif
Gallaecio Feb 23, 2021
5101094
HTTP/2: test a CONNECT request
Gallaecio Feb 24, 2021
12064d7
HTTP/2: improve header handling
Gallaecio Feb 24, 2021
386e2a5
tests/test_downloader_handlers_http2.py: fix style issue
Gallaecio Feb 24, 2021
5b2d3e1
Merge branch 'master' into http2
Gallaecio Mar 9, 2021
3bea5e1
Remove unused _is_data_lost method
Gallaecio Mar 9, 2021
2f61d7c
Remove unnecessary del statement
elacuesta Mar 15, 2021
Files changed
23 changes: 23 additions & 0 deletions docs/topics/settings.rst
@@ -666,6 +666,20 @@ handler (without replacement), place this in your ``settings.py``::
        'ftp': None,
    }

The default HTTPS handler uses HTTP/1.1. To use HTTP/2, update
:setting:`DOWNLOAD_HANDLERS` as follows::

    DOWNLOAD_HANDLERS = {
        'https': 'scrapy.core.downloader.handlers.http2.H2DownloadHandler',
    }

.. note::

    Scrapy currently does not support HTTP/2 Cleartext (h2c), since no major
    browser supports HTTP/2 unencrypted (see the `http2 faq`_).

.. _http2 faq: https://http2.github.io/faq/#does-http2-require-encryption

.. setting:: DOWNLOAD_TIMEOUT

DOWNLOAD_TIMEOUT
@@ -743,6 +757,15 @@ Optionally, this can be set on a per-request basis by using the
If :setting:`RETRY_ENABLED` is ``True`` and this setting is set to ``True``,
the ``ResponseFailed([_DataLoss])`` failure will be retried as usual.

.. warning::

    This setting is ignored by the
    :class:`~scrapy.core.downloader.handlers.http2.H2DownloadHandler`
    download handler (see :setting:`DOWNLOAD_HANDLERS`). In case of a data loss
    error, the corresponding HTTP/2 connection may be corrupted, affecting other
    requests that use the same connection; hence, a ``ResponseFailed([InvalidBodyLengthError])``
    failure is always raised for every request that was using that connection.
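
For illustration only (not part of this diff), a spider errback might surface
this failure. ``ResponseFailed`` and its ``reasons`` attribute are Twisted
APIs; the errback name is hypothetical::

    from twisted.web._newclient import ResponseFailed

    def errback(self, failure):
        # With the HTTP/2 handler, a data-loss error arrives as
        # ResponseFailed for every request that shared the corrupted
        # connection.
        if failure.check(ResponseFailed):
            for reason in failure.value.reasons:
                self.logger.warning('HTTP/2 download failed: %s',
                                    reason.getErrorMessage())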

.. setting:: DUPEFILTER_CLASS

DUPEFILTER_CLASS
55 changes: 53 additions & 2 deletions scrapy/core/downloader/contextfactory.py
@@ -1,10 +1,15 @@
import warnings

from OpenSSL import SSL
from twisted.internet._sslverify import _setAcceptableProtocols
from twisted.internet.ssl import optionsForClientTLS, CertificateOptions, platformTrust, AcceptableCiphers
from twisted.web.client import BrowserLikePolicyForHTTPS
from twisted.web.iweb import IPolicyForHTTPS
from zope.interface.declarations import implementer
from zope.interface.verify import verifyObject

from scrapy.core.downloader.tls import ScrapyClientTLSOptions, DEFAULT_CIPHERS
from scrapy.core.downloader.tls import DEFAULT_CIPHERS, openssl_methods, ScrapyClientTLSOptions
from scrapy.utils.misc import create_instance, load_object


@implementer(IPolicyForHTTPS)
@@ -81,8 +86,8 @@ class BrowserLikeContextFactory(ScrapyClientContextFactory):
    The default OpenSSL method is ``TLS_METHOD`` (also called
    ``SSLv23_METHOD``) which allows TLS protocol negotiation.
    """
    def creatorForNetloc(self, hostname, port):

    def creatorForNetloc(self, hostname, port):
        # trustRoot set to platformTrust() will use the platform's root CAs.
        #
        # This means that a website like https://www.cacert.org will be rejected
@@ -92,3 +97,49 @@ def creatorForNetloc(self, hostname, port):
            trustRoot=platformTrust(),
            extraCertificateOptions={'method': self._ssl_method},
        )


@implementer(IPolicyForHTTPS)
class AcceptableProtocolsContextFactory:
    """Context factory used to override the acceptable protocols of the
    wrapped context factory, setting up the ``OpenSSL.SSL.Context`` for
    NPN and/or ALPN negotiation.
    """

    def __init__(self, context_factory, acceptable_protocols):
        verifyObject(IPolicyForHTTPS, context_factory)
        self._wrapped_context_factory = context_factory
        self._acceptable_protocols = acceptable_protocols

    def creatorForNetloc(self, hostname, port):
        options = self._wrapped_context_factory.creatorForNetloc(hostname, port)
        _setAcceptableProtocols(options._ctx, self._acceptable_protocols)
        return options


def load_context_factory_from_settings(settings, crawler):
    ssl_method = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
    context_factory_cls = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
    # try method-aware context factory
    try:
        context_factory = create_instance(
            objcls=context_factory_cls,
            settings=settings,
            crawler=crawler,
            method=ssl_method,
        )
    except TypeError:
        # use context factory defaults
        context_factory = create_instance(
            objcls=context_factory_cls,
            settings=settings,
            crawler=crawler,
        )
        msg = """
'%s' does not accept `method` argument (type OpenSSL.SSL method, \
e.g. OpenSSL.SSL.SSLv23_METHOD) and/or `tls_verbose_logging` argument and/or `tls_ciphers` argument. \
Please upgrade your context factory class to handle them or ignore them.""" % (
            settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
        warnings.warn(msg)

    return context_factory
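
As a usage sketch (not part of this diff), the two helpers above can be
combined so that TLS connections advertise HTTP/2 via ALPN; the ``[b'h2']``
protocol list is an assumption mirroring what the HTTP/2 agent passes::

    from scrapy.core.downloader.contextfactory import (
        AcceptableProtocolsContextFactory,
        load_context_factory_from_settings,
    )
    from scrapy.settings import Settings

    settings = Settings()  # hypothetical standalone use; Scrapy normally supplies this
    base_factory = load_context_factory_from_settings(settings, crawler=None)
    # Wrap the base factory so the OpenSSL context offers "h2" during
    # NPN/ALPN negotiation.
    h2_factory = AcceptableProtocolsContextFactory(base_factory, [b'h2'])
    options = h2_factory.creatorForNetloc(b'example.com', 443)
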
27 changes: 2 additions & 25 deletions scrapy/core/downloader/handlers/http11.py
Original file line number Diff line number Diff line change
@@ -20,12 +20,11 @@
from zope.interface import implementer

from scrapy import signals
from scrapy.core.downloader.tls import openssl_methods
from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
from scrapy.core.downloader.webclient import _parse
from scrapy.exceptions import ScrapyDeprecationWarning, StopDownload
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import to_bytes, to_unicode


@@ -43,29 +42,7 @@ def __init__(self, settings, crawler=None):
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False

        self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        # try method-aware context factory
        try:
            self._contextFactory = create_instance(
                objcls=self._contextFactoryClass,
                settings=settings,
                crawler=crawler,
                method=self._sslMethod,
            )
        except TypeError:
            # use context factory defaults
            self._contextFactory = create_instance(
                objcls=self._contextFactoryClass,
                settings=settings,
                crawler=crawler,
            )
            msg = f"""
'{settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]}' does not accept `method` \
argument (type OpenSSL.SSL method, e.g. OpenSSL.SSL.SSLv23_METHOD) and/or \
`tls_verbose_logging` argument and/or `tls_ciphers` argument. \
Please upgrade your context factory class to handle them or ignore them."""
            warnings.warn(msg)
        self._contextFactory = load_context_factory_from_settings(settings, crawler)
        self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
        self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
        self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
Expand Down
119 changes: 119 additions & 0 deletions scrapy/core/downloader/handlers/http2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import warnings
from time import time
from typing import Optional
from urllib.parse import urldefrag

from twisted.internet.defer import Deferred
from twisted.internet.error import TimeoutError
from twisted.web.client import URI

from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
from scrapy.core.downloader.webclient import _parse
from scrapy.core.http2.agent import H2Agent, H2ConnectionPool, ScrapyProxyH2Agent
from scrapy.http import Request, Response
from scrapy.settings import Settings
from scrapy.spiders import Spider
from scrapy.utils.python import to_bytes


class H2DownloadHandler:
    def __init__(self, settings: Settings, crawler=None):
        self._crawler = crawler

        from twisted.internet import reactor
        self._pool = H2ConnectionPool(reactor, settings)
        self._context_factory = load_context_factory_from_settings(settings, crawler)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler)

    def download_request(self, request: Request, spider: Spider) -> Deferred:
        agent = ScrapyH2Agent(
            context_factory=self._context_factory,
            pool=self._pool,
            crawler=self._crawler
        )
        return agent.download_request(request, spider)

    def close(self) -> None:
        self._pool.close_connections()


class ScrapyH2Agent:
    _Agent = H2Agent
    _ProxyAgent = ScrapyProxyH2Agent

    def __init__(
        self, context_factory,
        pool: H2ConnectionPool,
        connect_timeout=10, bind_address: Optional[bytes] = None,
        crawler=None
    ) -> None:
        self._context_factory = context_factory
        self._connect_timeout = connect_timeout
        self._bind_address = bind_address
        self._pool = pool
        self._crawler = crawler

    def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
        from twisted.internet import reactor
        bind_address = request.meta.get('bindaddress') or self._bind_address
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxy_host = proxy_host.decode()
            omit_connect_tunnel = b'noconnect' in proxy_params
            if omit_connect_tunnel:
                warnings.warn("Using HTTPS proxies in the noconnect mode is not supported by the "
                              "downloader handler. If you use Crawlera, it doesn't require this "
                              "mode anymore, so you should update scrapy-crawlera to 1.3.0+ "
                              "and remove '?noconnect' from the Crawlera URL.")

            if scheme == b'https' and not omit_connect_tunnel:
                # ToDo
                raise NotImplementedError('Tunneling via CONNECT method using HTTP/2.0 is not yet supported')
            return self._ProxyAgent(
                reactor=reactor,
                context_factory=self._context_factory,
                proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
                connect_timeout=timeout,
                bind_address=bind_address,
                pool=self._pool
            )

        return self._Agent(
            reactor=reactor,
            context_factory=self._context_factory,
            connect_timeout=timeout,
            bind_address=bind_address,
            pool=self._pool
        )

    def download_request(self, request: Request, spider: Spider) -> Deferred:
        from twisted.internet import reactor
        timeout = request.meta.get('download_timeout') or self._connect_timeout
        agent = self._get_agent(request, timeout)

        start_time = time()
        d = agent.request(request, spider)
        d.addCallback(self._cb_latency, request, start_time)

        timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, timeout, timeout_cl)
        return d

    @staticmethod
    def _cb_latency(response: Response, request: Request, start_time: float) -> Response:
        request.meta['download_latency'] = time() - start_time
        return response

    @staticmethod
    def _cb_timeout(response: Response, request: Request, timeout: float, timeout_cl) -> Response:
        if timeout_cl.active():
            timeout_cl.cancel()
            return response

        url = urldefrag(request.url)[0]
        raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")
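
A minimal driving sketch (not part of this diff), assuming a crawl is already
running so that ``crawler``, ``request`` and ``spider`` exist; in normal use
Scrapy instantiates the handler through :setting:`DOWNLOAD_HANDLERS` as
documented above::

    from scrapy.core.downloader.handlers.http2 import H2DownloadHandler

    handler = H2DownloadHandler.from_crawler(crawler)
    # download_request() returns a twisted.internet.defer.Deferred that
    # fires with a scrapy.http.Response once the HTTP/2 stream completes.
    d = handler.download_request(request, spider)
    d.addCallback(lambda response: spider.logger.info('got %d', response.status))
    d.addErrback(lambda failure: spider.logger.error(failure.getErrorMessage()))
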
Empty file added: scrapy/core/http2/__init__.py