diff --git a/.circleci/config.yml b/.circleci/config.yml index d1430b7df6..8783f33d4d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,13 +6,20 @@ jobs: working_directory: ~/draft steps: + - run: + name: "Print Configuration" + command: | + xml2rfc --version + gem list -q kramdown-rfc2629 + echo -n 'mmark '; mmark --version + - restore_cache: name: "Restoring cache - Git" keys: - - v1-cache-git-{{ .Branch }}-{{ .Revision }} - - v1-cache-git-{{ .Branch }} - - v1-cache-git-master - - v1-cache-git- + - v2-cache-git-{{ .Branch }}-{{ .Revision }} + - v2-cache-git-{{ .Branch }} + - v2-cache-git-master + - v2-cache-git- - restore_cache: name: "Restoring cache - References" @@ -20,6 +27,17 @@ jobs: - v1-cache-references-{{ epoch }} - v1-cache-references- + # Workaround for https://discuss.circleci.com/t/22437 + - run: + name: Tag Checkout + command: | + if [ -n "$CIRCLE_TAG" ] && [ -d .git ]; then + remote=$(echo "$CIRCLE_REPOSITORY_URL" | \ + sed -e 's,^git.github.com:,https://github.com/,') + git fetch -f "$remote" "refs/tags/$CIRCLE_TAG:refs/tags/$CIRCLE_TAG" || \ + (echo 'Removing .git cache for tag build'; rm -rf .git) + fi + - checkout # Build txt and html versions of drafts @@ -31,7 +49,7 @@ jobs: - run: name: "Update GitHub Pages" command: | - if [ "${CIRCLE_TAG#draft-}" == "${CIRCLE_TAG}" ]; then + if [ "${CIRCLE_TAG#draft-}" == "$CIRCLE_TAG" ]; then make gh-pages fi @@ -39,7 +57,7 @@ jobs: - deploy: name: "Upload to Datatracker" command: | - if [ "${CIRCLE_TAG#draft-}" != "${CIRCLE_TAG}" ]; then + if [ "${CIRCLE_TAG#draft-}" != "$CIRCLE_TAG" ]; then make upload fi @@ -62,9 +80,9 @@ jobs: - save_cache: name: "Saving Cache - Git" - key: v1-cache-git-{{ .Branch }}-{{ .Revision }} + key: v2-cache-git-{{ .Branch }}-{{ .Revision }} paths: - - ~/draft + - ~/draft/.git - save_cache: name: "Saving Cache - Drafts" @@ -72,6 +90,7 @@ jobs: paths: - ~/.cache/xml2rfc + workflows: version: 2 build: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 
8f2149a633..b5a6fb0767 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -86,11 +86,6 @@ When a new draft is published, the design issues that have been closed since the When a design issue is `closed`, it implies that the issue has a proposed resolution that is reflected in the drafts; if a `closed` design issue is labeled with `has-consensus`, it means that the incorporated resolution has Working Group consensus. -The drafts currently in the early stage are: - -* HTTP/3 -* QPACK -* Recovery ### Late-Stage Process @@ -106,6 +101,9 @@ The drafts currently in the late stage are: * Invariants * Transport * TLS +* HTTP/3 +* QPACK +* Recovery ![diagram of the late stage workflow](workflow.png "Late Stage Workflow") diff --git a/draft-ietf-quic-http.md b/draft-ietf-quic-http.md index 24a6e623fd..7eb5963c73 100644 --- a/draft-ietf-quic-http.md +++ b/draft-ietf-quic-http.md @@ -142,7 +142,7 @@ and DATA frames form the basis of HTTP requests and responses ({{request-response}}). Multiplexing of requests is performed using the QUIC stream abstraction, -described in Section 2 of {{QUIC-TRANSPORT}}. Each request and response +described in Section 2 of {{QUIC-TRANSPORT}}. Each request-response pair consumes a single QUIC stream. Streams are independent of each other, so one stream that is blocked or suffers packet loss does not prevent progress on other streams. @@ -151,7 +151,7 @@ Server push is an interaction mode introduced in HTTP/2 {{!HTTP2}} which permits a server to push a request-response exchange to a client in anticipation of the client making the indicated request. This trades off network usage against a potential latency gain. Several HTTP/3 frames are used to manage server push, -such as PUSH_PROMISE, DUPLICATE_PUSH, MAX_PUSH_ID, and CANCEL_PUSH. +such as PUSH_PROMISE, MAX_PUSH_ID, and CANCEL_PUSH. As in HTTP/2, request and response headers are compressed for transmission. 
Because HPACK {{?HPACK=RFC7541}} relies on in-order transmission of compressed @@ -162,8 +162,8 @@ modifying it. ## Document Organization -The HTTP/3 specification is split into seven parts. The document begins -with a detailed overview of the connection lifecycle and key concepts: +The following sections provide a detailed overview of the connection lifecycle +and key concepts: - Connection Setup and Management ({{connection-setup}}) covers how an HTTP/3 endpoint is discovered and a connection is established. @@ -224,7 +224,7 @@ endpoint: frame: : The smallest unit of communication on a stream in HTTP/3, consisting of a - header and a variable-length sequence of octets structured according to the + header and a variable-length sequence of bytes structured according to the frame type. Protocol elements called "frames" exist in both this document and @@ -276,6 +276,11 @@ Implementations of draft versions of the protocol MUST add the string "-" and the corresponding draft number to the identifier. For example, draft-ietf-quic-http-01 is identified using the string "h3-01". +Draft versions MUST use the corresponding draft transport version as their +transport. For example, the application protocol defined in +draft-ietf-quic-http-25 uses the transport defined in +draft-ietf-quic-transport-25. + Non-compatible experiments that are based on these draft versions MUST append the string "-" and an experiment name to the identifier. For example, an experimental implementation based on draft-ietf-quic-http-09 which reserves an @@ -310,50 +315,16 @@ existing connection or try another alternative endpoint offered by the origin. Servers MAY serve HTTP/3 on any UDP port, since an alternative always includes an explicit port. -### QUIC Version Hints {#alt-svc-version-hint} - -This document defines the "quic" parameter for Alt-Svc, which MAY be used to -provide version-negotiation hints to HTTP/3 clients. 
QUIC versions are four-byte -sequences with no additional constraints on format. Leading zeros SHOULD be -omitted for brevity. - -Syntax of the "quic" parameter value: - -~~~ abnf -quic = DQUOTE quic-version *( "," quic-version ) DQUOTE -quic-version = 1*8HEXDIG ; hex-encoded QUIC version -~~~ - -Where multiple versions are listed, the order of the values reflects the -server's preference (with the first value being the most preferred version). -Reserved versions MAY be listed, but unreserved versions which are not supported -by the alternative SHOULD NOT be present in the list. Origins MAY omit supported -versions for any reason. - -Clients MUST ignore any included versions which they do not support. The "quic" -parameter MUST NOT occur more than once; clients SHOULD process only the first -occurrence. - -For example, suppose a server supported both version 0x00000001 and the version -rendered in ASCII as "Q034". If it also opted to include the reserved version -(from Section 15 of {{QUIC-TRANSPORT}}) 0x1abadaba, it could specify the -following header field: - -~~~ example -Alt-Svc: h3=":49288";quic="1,1abadaba,51303334" -~~~ - -A client acting on this header field would drop the reserved version (not -supported), then attempt to connect to the alternative using the first version -in the list which it does support, if any. - ## Connection Establishment {#connection-establishment} -HTTP/3 relies on QUIC as the underlying transport. The QUIC version being used -MUST use TLS version 1.3 or greater as its handshake protocol. HTTP/3 clients -MUST indicate the target domain name during the TLS handshake. This may be done -using the Server Name Indication (SNI) {{!RFC6066}} extension to TLS or using -some other mechanism. +HTTP/3 relies on QUIC version 1 as the underlying transport. The use of other +QUIC transport versions with HTTP/3 MAY be defined by future specifications. + +QUIC version 1 uses TLS version 1.3 or greater as its handshake protocol. 
+HTTP/3 clients MUST support a mechanism to indicate the target host to the +server during the TLS handshake. If the server is identified by a DNS name, +clients MUST send the Server Name Indication (SNI) {{!RFC6066}} TLS extension +unless an alternative mechanism to indicate the target host is used. QUIC connections are established as described in {{QUIC-TRANSPORT}}. During connection establishment, HTTP/3 support is indicated by selecting the ALPN @@ -375,7 +346,7 @@ any requests for which the client considers the server authoritative. An authoritative HTTP/3 endpoint is typically discovered because the client has received an Alt-Svc record from the request's origin which nominates the endpoint as a valid HTTP Alternative Service for that origin. As required by -{{RFC7838}}, clients MUST check that the nominated server can present a valid +{{!RFC7838}}, clients MUST check that the nominated server can present a valid certificate for the origin before considering it authoritative. Clients MUST NOT assume that an HTTP/3 endpoint is authoritative for other origins without an explicit signal. @@ -405,21 +376,35 @@ A client MUST send only a single request on a given stream. A server sends zero or more non-final HTTP responses on the same stream as the request, followed by a single final HTTP response, as detailed below. +Pushed responses are sent on a server-initiated unidirectional QUIC stream (see +{{push-streams}}). A server sends zero or more non-final HTTP responses, +followed by a single final HTTP response, in the same manner as a standard +response. Push is described in more detail in {{server-push}}. + +On a given stream, receipt of multiple requests or receipt of an additional HTTP +response following a final HTTP response MUST be treated as malformed +({{malformed}}). + An HTTP message (request or response) consists of: -1. the message header (see {{!RFC7230}}, Section 3.2), sent as a single HEADERS - frame (see {{frame-headers}}), +1. 
the message header (see Section 3.2 of {{!RFC7230}}), sent as a single + HEADERS frame (see {{frame-headers}}), -2. optionally, the payload body, if present (see {{!RFC7230}}, Section 3.3), +2. optionally, the payload body, if present (see Section 3.3 of {{!RFC7230}}), sent as a series of DATA frames (see {{frame-data}}), -3. optionally, trailing headers, if present (see {{!RFC7230}}, Section 4.1.2), +3. optionally, trailing headers, if present (see Section 4.1.2 of {{!RFC7230}}), sent as a single HEADERS frame. +Receipt of DATA and HEADERS frames in any other sequence MUST be treated as a +connection error of type H3_FRAME_UNEXPECTED ({{errors}}). + A server MAY send one or more PUSH_PROMISE frames (see {{frame-push-promise}}) before, after, or interleaved with the frames of a response message. These PUSH_PROMISE frames are not part of the response; see {{server-push}} for more -details. +details. These frames are not permitted in pushed responses; a pushed response +which includes PUSH_PROMISE frames MUST be treated as a connection error of type +H3_FRAME_UNEXPECTED. Frames of unknown types ({{extensions}}), including reserved frames ({{frame-reserved}}) MAY be sent on a request or push stream before, after, or @@ -434,38 +419,41 @@ The "chunked" transfer encoding defined in Section 4.1 of {{!RFC7230}} MUST NOT be used. A response MAY consist of multiple messages when and only when one or more -informational responses (1xx; see {{!RFC7231}}, Section 6.2) precede a final +informational responses (1xx; see Section 6.2 of {{!RFC7231}}) precede a final response to the same request. Non-final responses do not contain a payload body or trailers. If an endpoint receives an invalid sequence of frames on either a request or a push stream, it MUST respond with a connection error of type -HTTP_FRAME_UNEXPECTED ({{errors}}). In particular, a DATA frame before any +H3_FRAME_UNEXPECTED ({{errors}}). 
In particular, a DATA frame before any HEADERS frame, or a HEADERS or DATA frame after the trailing HEADERS frame is considered invalid. -An HTTP request/response exchange fully consumes a bidirectional QUIC stream. -After sending a request, a client MUST close the stream for sending. Unless -using the CONNECT method (see {{the-connect-method}}), clients MUST NOT make -stream closure dependent on receiving a response to their request. After sending -a final response, the server MUST close the stream for sending. At this point, -the QUIC stream is fully closed. +An HTTP request/response exchange fully consumes a client-initiated +bidirectional QUIC stream. After sending a request, a client MUST close the +stream for sending. Unless using the CONNECT method (see {{connect}}), clients +MUST NOT make stream closure dependent on receiving a response to their request. +After sending a final response, the server MUST close the stream for sending. At +this point, the QUIC stream is fully closed. When a stream is closed, this indicates the end of an HTTP message. Because some messages are large or unbounded, endpoints SHOULD begin processing partial HTTP messages once enough of the message has been received to make progress. If a client stream terminates without enough of the HTTP message to provide a complete response, the server SHOULD abort its response with the error code -HTTP_REQUEST_INCOMPLETE. +H3_REQUEST_INCOMPLETE. A server can send a complete response prior to the client sending an entire request if the response does not depend on any portion of the request that has -not been sent and received. When this is true, a server MAY abort reading the -request stream with error code HTTP_EARLY_RESPONSE, send a complete response, -and cleanly close the sending part of the stream. Clients MUST NOT discard -complete responses as a result of having their request terminated abruptly, -though clients can always discard responses at their discretion for other -reasons. 
+not been sent and received. When the server does not need to receive the +remainder of the request, it MAY abort reading the request stream, send a +complete response, and cleanly close the sending part of the stream. The error +code H3_NO_ERROR SHOULD be used when requesting that the client stop sending on +the request stream. Clients MUST NOT discard complete responses as a result of +having their request terminated abruptly, though clients can always discard +responses at their discretion for other reasons. If the server sends a partial +or complete response but does not abort reading, clients SHOULD continue sending +the body of the request and close the stream normally. ### Header Formatting and Compression {#header-formatting} @@ -483,15 +471,104 @@ field names MUST be converted to lowercase prior to their encoding. A request or response containing uppercase header field names MUST be treated as malformed ({{malformed}}). +Like HTTP/2, HTTP/3 does not use the Connection header field to indicate +connection-specific header fields; in this protocol, connection-specific +metadata is conveyed by other means. An endpoint MUST NOT generate an HTTP/3 +message containing connection-specific header fields; any message containing +connection-specific header fields MUST be treated as malformed ({{malformed}}). + +The only exception to this is the TE header field, which MAY be present in an +HTTP/3 request; when it is, it MUST NOT contain any value other than "trailers". + +This means that an intermediary transforming an HTTP/1.x message to HTTP/3 will +need to remove any header fields nominated by the Connection header field, along +with the Connection header field itself. Such intermediaries SHOULD also remove +other connection-specific header fields, such as Keep-Alive, Proxy-Connection, +Transfer-Encoding, and Upgrade, even if they are not nominated by the Connection +header field. 
+ +#### Pseudo-Header Fields + As in HTTP/2, HTTP/3 uses special pseudo-header fields beginning with the ':' character (ASCII 0x3a) to convey the target URI, the method of the request, and -the status code for the response. These pseudo-header fields are defined in -Section 8.1.2.3 and 8.1.2.4 of {{!HTTP2}}. Pseudo-header fields are not HTTP -header fields. Endpoints MUST NOT generate pseudo-header fields other than -those defined in {{!HTTP2}}. The restrictions on the use of pseudo-header -fields in Section 8.1.2 of {{!HTTP2}} also apply to HTTP/3. Messages which -are considered malformed under these restrictions are handled as described in -{{malformed}}. +the status code for the response. + +Pseudo-header fields are not HTTP header fields. Endpoints MUST NOT generate +pseudo-header fields other than those defined in this document, except as +negotiated via an extension; see {{extensions}}. + +Pseudo-header fields are only valid in the context in which they are defined. +Pseudo-header fields defined for requests MUST NOT appear in responses; +pseudo-header fields defined for responses MUST NOT appear in requests. +Pseudo-header fields MUST NOT appear in trailers. Endpoints MUST treat a +request or response that contains undefined or invalid pseudo-header fields as +malformed ({{malformed}}). + +All pseudo-header fields MUST appear in the header block before regular header +fields. Any request or response that contains a pseudo-header field that +appears in a header block after a regular header field MUST be treated as +malformed ({{malformed}}). + +The following pseudo-header fields are defined for requests: + + ":method": + + : Contains the HTTP method (Section 4 of {{!RFC7231}}) + + ":scheme": + + : Contains the scheme portion of the target URI (Section 3.1 of {{!RFC3986}}) + + : ":scheme" is not restricted to "http" and "https" schemed URIs. 
A proxy or + gateway can translate requests for non-HTTP schemes, enabling the use of + HTTP to interact with non-HTTP services. + + ":authority": + + : Contains the authority portion of the target URI (Section 3.2 of [RFC3986]). + The authority MUST NOT include the deprecated "userinfo" subcomponent for + "http" or "https" schemed URIs. + + : To ensure that the HTTP/1.1 request line can be reproduced accurately, this + pseudo-header field MUST be omitted when translating from an HTTP/1.1 + request that has a request target in origin or asterisk form (see Section + 5.3 of [RFC7230]). Clients that generate HTTP/3 requests directly SHOULD + use the ":authority" pseudo-header field instead of the Host header field. + An intermediary that converts an HTTP/3 request to HTTP/1.1 MUST create a + Host header field if one is not present in a request by copying the value of + the ":authority" pseudo-header field. + + ":path": + + : Contains the path and query parts of the target URI (the "path-absolute" + production and optionally a '?' character followed by the "query" production; + see Sections 3.3 and 3.4 of [RFC3986]). A request in asterisk form + includes the value '*' for the ":path" pseudo-header field. + + : This pseudo-header field MUST NOT be empty for "http" or "https" URIs; + "http" or "https" URIs that do not contain a path component MUST include a + value of '/'. The exception to this rule is an OPTIONS request for an + "http" or "https" URI that does not include a path component; these MUST + include a ":path" pseudo-header field with a value of '*' (see Section 5.3.4 + of [RFC7230]). + +All HTTP/3 requests MUST include exactly one value for the ":method", ":scheme", +and ":path" pseudo-header fields, unless it is a CONNECT request ({{connect}}). +An HTTP request that omits mandatory pseudo-header fields or contains invalid +values for those fields is malformed ({{malformed}}). 
+ +HTTP/3 does not define a way to carry the version identifier that is included in +the HTTP/1.1 request line. + +For responses, a single ":status" pseudo-header field is defined that carries +the HTTP status code field (see Section 6 of [RFC7231]). This pseudo-header +field MUST be included in all responses; otherwise, the response is malformed +({{malformed}}). + +HTTP/3 does not define a way to carry the version or reason phrase that is +included in an HTTP/1.1 status line. + +#### Header Compression HTTP/3 uses QPACK header compression as described in [QPACK], a variation of HPACK which allows the flexibility to avoid header-compression-induced @@ -501,7 +578,9 @@ To allow for better compression efficiency, the cookie header field {{!RFC6265}} MAY be split into separate header fields, each with one or more cookie-pairs, before compression. If a decompressed header list contains multiple cookie header fields, these MUST be concatenated before being passed into a non-HTTP/2, -non-HTTP/3 context, as described in {{!HTTP2}}, Section 8.1.2.5. +non-HTTP/3 context, as described in Section 8.1.2.5 of {{!HTTP2}}. + +#### Header Size Constraints An HTTP/3 implementation MAY impose a limit on the maximum size of the message header it will accept on an individual HTTP message. A server that receives a @@ -522,26 +601,26 @@ this limit are not guaranteed to be accepted. ### Request Cancellation and Rejection {#request-cancellation} Clients can cancel requests by resetting and aborting the request stream with an -error code of HTTP_REQUEST_CANCELLED ({{http-error-codes}}). When the client +error code of H3_REQUEST_CANCELLED ({{http-error-codes}}). When the client aborts reading a response, it indicates that this response is no longer of interest. Implementations SHOULD cancel requests by abruptly terminating any directions of a stream that are still open. 
When the server rejects a request without performing any application processing, -it SHOULD abort its response stream with the error code HTTP_REQUEST_REJECTED. +it SHOULD abort its response stream with the error code H3_REQUEST_REJECTED. In this context, "processed" means that some data from the stream was passed to some higher layer of software that might have taken some action as a result. The client can treat requests rejected by the server as though they had never been sent at all, thereby allowing them to be retried later on a new connection. -Servers MUST NOT use the HTTP_REQUEST_REJECTED error code for requests which +Servers MUST NOT use the H3_REQUEST_REJECTED error code for requests which were partially or fully processed. When a server abandons a response after partial processing, it SHOULD abort its response stream with the error code -HTTP_REQUEST_CANCELLED. +H3_REQUEST_CANCELLED. -When a client resets a request with the error code HTTP_REQUEST_CANCELLED, a +When a client resets a request with the error code H3_REQUEST_CANCELLED, a server MAY abruptly terminate the response using the error code -HTTP_REQUEST_REJECTED if no processing was performed. Clients MUST NOT use the -HTTP_REQUEST_REJECTED error code, except when a server has requested closure of +H3_REQUEST_REJECTED if no processing was performed. Clients MUST NOT use the +H3_REQUEST_REJECTED error code, except when a server has requested closure of the request stream with this error code. If a stream is cancelled after receiving a complete response, the client MAY @@ -553,9 +632,14 @@ permitted (e.g., idempotent actions like GET, PUT, or DELETE). ### Malformed Requests and Responses {#malformed} A malformed request or response is one that is an otherwise valid sequence of -frames but is invalid due to the presence of extraneous frames, prohibited -header fields, the absence of mandatory header fields, or the inclusion of -uppercase header field names. 
+frames but is invalid due to: + +- the presence of prohibited header fields or pseudo-header fields, +- the absence of mandatory pseudo-header fields, +- invalid values for pseudo-header fields, +- pseudo-header fields after header fields, +- an invalid sequence of HTTP messages, or +- the inclusion of uppercase header field names. A request or response that includes a payload body can include a `content-length` header field. A request or response is also malformed if the @@ -567,7 +651,7 @@ content-length header field, even though no content is included in DATA frames. Intermediaries that process HTTP requests or responses (i.e., any intermediary not acting as a tunnel) MUST NOT forward a malformed request or response. Malformed requests or responses that are detected MUST be treated as a stream -error ({{errors}}) of type HTTP_GENERAL_PROTOCOL_ERROR. +error ({{errors}}) of type H3_GENERAL_PROTOCOL_ERROR. For malformed requests, a server MAY send an HTTP response prior to closing or resetting the stream. Clients MUST NOT accept a malformed response. Note that @@ -576,9 +660,9 @@ attacks against HTTP; they are deliberately strict because being permissive can expose implementations to these vulnerabilities. -## The CONNECT Method +## The CONNECT Method {#connect} -The pseudo-method CONNECT ({{!RFC7231}}, Section 4.3.6) is primarily used with +The pseudo-method CONNECT (Section 4.3.6 of {{!RFC7231}}) is primarily used with HTTP proxies to establish a TLS session with an origin server for the purposes of interacting with "https" resources. In HTTP/1.x, CONNECT is used to convert an entire HTTP connection into a tunnel to a remote host. In HTTP/2, the CONNECT @@ -586,14 +670,14 @@ method is used to establish a tunnel over a single HTTP/2 stream to a remote host for similar purposes. A CONNECT request in HTTP/3 functions in the same manner as in HTTP/2. The -request MUST be formatted as described in {{!HTTP2}}, Section 8.3. 
A CONNECT +request MUST be formatted as described in Section 8.3 of {{!HTTP2}}. A CONNECT request that does not conform to these restrictions is malformed (see {{malformed}}). The request stream MUST NOT be closed at the end of the request. A proxy that supports CONNECT establishes a TCP connection ({{!RFC0793}}) to the server identified in the ":authority" pseudo-header field. Once this connection is successfully established, the proxy sends a HEADERS frame containing a 2xx -series status code to the client, as defined in {{!RFC7231}}, Section 4.3.6. +series status code to the client, as defined in Section 4.3.6 of {{!RFC7231}}. All DATA frames on the stream correspond to data sent or received on the TCP connection. Any DATA frame sent by the client is transmitted by the proxy to the @@ -604,7 +688,7 @@ map predictably to the size and number of HTTP DATA or QUIC STREAM frames. Once the CONNECT method has completed, only DATA frames are permitted to be sent on the stream. Extension frames MAY be used if specifically permitted by the definition of the extension. Receipt of any other frame type -MUST be treated as a connection error of type HTTP_FRAME_UNEXPECTED. +MUST be treated as a connection error of type H3_FRAME_UNEXPECTED. The TCP connection can be closed by either peer. When the client ends the request stream (that is, the receive stream at the proxy enters the "Data Recvd" @@ -617,7 +701,7 @@ data from the target of the CONNECT. A TCP connection error is signaled by abruptly terminating the stream. A proxy treats any error in the TCP connection, which includes receiving a TCP segment -with the RST bit set, as a stream error of type HTTP_CONNECT_ERROR +with the RST bit set, as a stream error of type H3_CONNECT_ERROR ({{http-error-codes}}). Correspondingly, if a proxy detects an error with the stream or the QUIC connection, it MUST close the TCP connection. 
If the underlying TCP implementation permits it, the proxy SHOULD send a TCP segment @@ -625,8 +709,9 @@ with the RST bit set. ## HTTP Upgrade -HTTP/3 does not support the HTTP Upgrade mechanism ([RFC7230], Section 6.7) or -101 (Switching Protocols) informational status code ([RFC7231], Section 6.2.2). +HTTP/3 does not support the HTTP Upgrade mechanism (Section 6.7 of [RFC7230]) or +101 (Switching Protocols) informational status code (Section 6.2.2 of +[RFC7231]). ## Server Push @@ -636,11 +721,13 @@ client making the indicated request. This trades off network usage against a potential latency gain. HTTP/3 server push is similar to what is described in HTTP/2 {{!HTTP2}}, but uses different mechanisms. -Each server push is identified by a unique Push ID. This Push ID is used in a -single PUSH_PROMISE frame (see {{frame-push-promise}}) which carries the request -headers, possibly included in one or more DUPLICATE_PUSH frames (see -{{frame-duplicate-push}}), then included with the push stream which ultimately -fulfills those promises. +Each server push is identified by a unique Push ID. This Push ID is used in one +or more PUSH_PROMISE frames (see {{frame-push-promise}}) that carry the request +headers, then included with the push stream which ultimately fulfills those +promises. When the same Push ID is promised on multiple request streams, the +decompressed request header sets MUST contain the same fields in the +same order, and both the name and the value in each field MUST be exact +matches. Server push is only enabled on a connection when a client sends a MAX_PUSH_ID frame (see {{frame-max-push-id}}). A server cannot use server push until it @@ -648,44 +735,41 @@ receives a MAX_PUSH_ID frame. A client sends additional MAX_PUSH_ID frames to control the number of pushes that a server can promise. A server SHOULD use Push IDs sequentially, starting at 0. 
A client MUST treat receipt of a push stream with a Push ID that is greater than the maximum Push ID as a connection error of -type HTTP_ID_ERROR. +type H3_ID_ERROR. The header of the request message is carried by a PUSH_PROMISE frame (see -{{frame-push-promise}}) on the request stream which generated the push. This -allows the server push to be associated with a client request. Promised +{{frame-push-promise}}) on the request stream which generated the push. Promised requests MUST conform to the requirements in Section 8.2 of {{!HTTP2}}. -The same server push can be associated with additional client requests using a -DUPLICATE_PUSH frame (see {{frame-duplicate-push}}). +Each pushed response is associated with one or more client requests. The push +is associated with the request stream on which the PUSH_PROMISE frame was +received. The same server push can be associated with additional client +requests using a PUSH_PROMISE frame with the same Push ID on multiple request +streams. These associations do not affect the operation of the protocol, but +MAY be considered by user agents when deciding how to use pushed resources. -Ordering of a PUSH_PROMISE or DUPLICATE_PUSH in relation to certain parts of the -response is important. The server SHOULD send PUSH_PROMISE or DUPLICATE_PUSH -frames prior to sending HEADERS or DATA frames that reference the promised -responses. This reduces the chance that a client requests a resource that will -be pushed by the server. +Ordering of a PUSH_PROMISE in relation to certain parts of the response is +important. The server SHOULD send PUSH_PROMISE frames prior to sending HEADERS +or DATA frames that reference the promised responses. This reduces the chance +that a client requests a resource that will be pushed by the server. When a server later fulfills a promise, the server push response is conveyed on a push stream (see {{push-streams}}). 
The push stream identifies the Push ID of the promise that it fulfills, then contains a response to the promised request using the same format described for responses in {{request-response}}. -Due to reordering, DUPLICATE_PUSH frames or push stream data can arrive before -the corresponding PUSH_PROMISE frame. When a client receives a DUPLICATE_PUSH -frame for an as-yet-unknown Push ID, the request headers of the push are not -immediately available. The client can either delay generating new requests for -content referenced following the DUPLICATE_PUSH frame until the request headers -become available, or can initiate requests for discovered resources and cancel -the requests if the requested resource is already being pushed. When a client -receives a new push stream with an as-yet-unknown Push ID, both the associated -client request and the pushed request headers are unknown. The client can -buffer the stream data in expectation of the matching PUSH_PROMISE. The client -can use stream flow control (see section 4.1 of {{QUIC-TRANSPORT}}) to limit the -amount of data a server may commit to the pushed stream. +Due to reordering, push stream data can arrive before the corresponding +PUSH_PROMISE frame. When a client receives a new push stream with an +as-yet-unknown Push ID, both the associated client request and the pushed +request headers are unknown. The client can buffer the stream data in +expectation of the matching PUSH_PROMISE. The client can use stream flow control +(see Section 4.1 of {{QUIC-TRANSPORT}}) to limit the amount of data a server may +commit to the pushed stream. If a promised server push is not needed by the client, the client SHOULD send a CANCEL_PUSH frame. If the push stream is already open or opens after sending the CANCEL_PUSH frame, the client can abort reading the stream with an error code of -HTTP_REQUEST_CANCELLED. This asks the server not to transfer additional data and +H3_REQUEST_CANCELLED. 
This asks the server not to transfer additional data and indicates that it will be discarded upon receipt. # Connection Closure @@ -758,19 +842,22 @@ A client that is unable to retry requests loses all requests that are in flight when the server closes the connection. A server MAY send multiple GOAWAY frames indicating different stream IDs, but MUST NOT increase the value they send in the last Stream ID, since clients might already have retried unprocessed -requests on another connection. A server that is attempting to gracefully shut -down a connection SHOULD send an initial GOAWAY frame with the last Stream ID -set to the maximum value allowed by QUIC's MAX_STREAMS and SHOULD NOT increase -the MAX_STREAMS limit thereafter. This signals to the client that a shutdown is -imminent and that initiating further requests is prohibited. After allowing -time for any in-flight requests (at least one round-trip time), the server MAY -send another GOAWAY frame with an updated last Stream ID. This ensures that a -connection can be cleanly shut down without losing requests. +requests on another connection. + +A server that is attempting to gracefully shut down a connection can send an +initial GOAWAY frame with the last Stream ID set to the maximum possible value +for a client-initiated, bidirectional stream (i.e. 2^62-4 in case of QUIC +version 1). This GOAWAY frame signals to the client that shutdown is imminent +and that initiating further requests is prohibited. After allowing time for any +in-flight requests to reach the server, the server can send another GOAWAY frame +indicating which requests it will accept before the end of the connection. This +ensures that a connection can be cleanly shut down without causing requests to +fail. Once all accepted requests have been processed, the server can permit the connection to become idle, or MAY initiate an immediate closure of the connection. 
An endpoint that completes a graceful shutdown SHOULD use the -HTTP_NO_ERROR code when closing the connection. +H3_NO_ERROR code when closing the connection. If a client has consumed all available bidirectional stream IDs with requests, the server need not send a GOAWAY frame, since the client is unable to make @@ -827,13 +914,13 @@ correlated with the request. This means that the client's first request occurs on QUIC stream 0, with subsequent requests on stream 4, 8, and so on. In order to permit these streams to open, an HTTP/3 server SHOULD configure non-zero minimum values for the number of permitted streams and the initial stream flow -control window. It is RECOMMENDED that at least 100 requests be permitted at a -time, so as to not unnecessarily limit parallelism. +control window. So as to not unnecessarily limit parallelism, at least 100 +requests SHOULD be permitted at a time. HTTP/3 does not use server-initiated bidirectional streams, though an extension could define a use for these streams. Clients MUST treat receipt of a server-initiated bidirectional stream as a connection error of type -HTTP_STREAM_CREATION_ERROR unless such an extension has been negotiated. +H3_STREAM_CREATION_ERROR unless such an extension has been negotiated. ## Unidirectional Streams @@ -881,7 +968,7 @@ create additional streams as allowed by their peer. If the stream header indicates a stream type which is not supported by the recipient, the remainder of the stream cannot be consumed as the semantics are unknown. Recipients of unknown stream types MAY abort reading of the stream with -an error code of HTTP_STREAM_CREATION_ERROR, but MUST NOT consider such streams +an error code of H3_STREAM_CREATION_ERROR, but MUST NOT consider such streams to be a connection error of any kind. Implementations MAY send stream types before knowing whether the peer supports @@ -901,13 +988,13 @@ consists of HTTP/3 frames, as defined in {{frames}}. 
Each side MUST initiate a single control stream at the beginning of the connection and send its SETTINGS frame as the first frame on this stream. If the first frame of the control stream is any other frame type, this MUST be -treated as a connection error of type HTTP_MISSING_SETTINGS. Only one control +treated as a connection error of type H3_MISSING_SETTINGS. Only one control stream per peer is permitted; receipt of a second stream which claims to be a control stream MUST be treated as a connection error of type -HTTP_STREAM_CREATION_ERROR. The sender MUST NOT close the control stream, and +H3_STREAM_CREATION_ERROR. The sender MUST NOT close the control stream, and the receiver MUST NOT request that the sender close the control stream. If either control stream is closed at any point, this MUST be treated as a -connection error of type HTTP_CLOSED_CRITICAL_STREAM. +connection error of type H3_CLOSED_CRITICAL_STREAM. A pair of unidirectional streams is used rather than a single bidirectional stream. This allows either peer to send data as soon as it is able. Depending @@ -929,7 +1016,7 @@ responses followed by a single final HTTP response, as defined in {{server-push}}. Only servers can push; if a server receives a client-initiated push stream, this -MUST be treated as a connection error of type HTTP_STREAM_CREATION_ERROR. +MUST be treated as a connection error of type H3_STREAM_CREATION_ERROR. ~~~~~~~~~~ drawing 0 1 2 3 @@ -944,7 +1031,7 @@ MUST be treated as a connection error of type HTTP_STREAM_CREATION_ERROR. Each Push ID MUST only be used once in a push stream header. If a push stream header includes a Push ID that was used in another push stream header, the -client MUST treat this as a connection error of type HTTP_ID_ERROR. +client MUST treat this as a connection error of type H3_ID_ERROR. ### Reserved Stream Types {#stream-grease} @@ -956,7 +1043,9 @@ transferred. Endpoints MUST NOT consider these streams to have any meaning upon receipt. 
The payload and length of the stream are selected in any manner the -implementation chooses. +implementation chooses. Implementations MAY terminate these streams cleanly, or +MAY abruptly terminate them. When terminating abruptly, the error code +H3_NO_ERROR or a reserved error code ({{http-error-codes}}) SHOULD be used. # HTTP Framing Layer {#http-framing-layer} @@ -976,9 +1065,8 @@ comparison between HTTP/2 and HTTP/3 frames is provided in {{h2-frames}}. | PUSH_PROMISE | No | Yes | No | {{frame-push-promise}} | | GOAWAY | Yes | No | No | {{frame-goaway}} | | MAX_PUSH_ID | Yes | No | No | {{frame-max-push-id}} | -| DUPLICATE_PUSH | No | Yes | No | {{frame-duplicate-push}} | | Reserved | Yes | Yes | Yes | {{frame-reserved}} | -{: #stream-frame-mapping title="HTTP/3 frames and stream type overview"} +{: #stream-frame-mapping title="HTTP/3 Frames and Stream Type Overview"} Certain frames can only occur as the first frame of a particular stream type; these are indicated in {{stream-frame-mapping}} with a (1). Specific guidance @@ -1001,7 +1089,7 @@ All frames have the following format: | Frame Payload (*) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~~~~~~~~ -{: #fig-frame title="HTTP/3 frame format"} +{: #fig-frame title="HTTP/3 Frame Format"} A frame includes the following fields: @@ -1009,7 +1097,8 @@ A frame includes the following fields: : A variable-length integer that identifies the frame type. Length: - : A variable-length integer that describes the length of the Frame Payload. + : A variable-length integer that describes the length in bytes of + the Frame Payload. Frame Payload: : A payload, the semantics of which are determined by the Type field. @@ -1018,11 +1107,11 @@ Each frame's payload MUST contain exactly the fields identified in its description. 
A frame payload that contains additional bytes after the identified fields or a frame payload that terminates before the end of the identified fields MUST be treated as a connection error of type -HTTP_FRAME_ERROR. +H3_FRAME_ERROR. When a stream terminates cleanly, if the last frame on the stream was truncated, this MUST be treated as a connection error ({{errors}}) of type -HTTP_FRAME_ERROR. Streams which terminate abruptly may be reset at any point in +H3_FRAME_ERROR. Streams which terminate abruptly may be reset at any point in a frame. ## Frame Definitions {#frames} @@ -1034,7 +1123,7 @@ associated with an HTTP request or response payload. DATA frames MUST be associated with an HTTP request or response. If a DATA frame is received on a control stream, the recipient MUST respond with a -connection error ({{errors}}) of type HTTP_FRAME_UNEXPECTED. +connection error ({{errors}}) of type H3_FRAME_UNEXPECTED. ~~~~~~~~~~ drawing 0 1 2 3 @@ -1043,7 +1132,7 @@ connection error ({{errors}}) of type HTTP_FRAME_UNEXPECTED. | Payload (*) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~~~~~~~~ -{: #fig-data title="DATA frame payload"} +{: #fig-data title="DATA Frame Payload"} ### HEADERS {#frame-headers} @@ -1057,11 +1146,11 @@ QPACK. See [QPACK] for more details. | Header Block (*) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~~~~~~~~ -{: #fig-headers title="HEADERS frame payload"} +{: #fig-headers title="HEADERS Frame Payload"} HEADERS frames can only be sent on request / push streams. If a HEADERS frame is received on a control stream, the recipient MUST respond with a connection -error ({{errors}}) of type HTTP_FRAME_UNEXPECTED. +error ({{errors}}) of type H3_FRAME_UNEXPECTED. ### CANCEL_PUSH {#frame-cancel-push} @@ -1089,7 +1178,7 @@ has already received a corresponding push stream. A CANCEL_PUSH frame is sent on the control stream. 
Receiving a CANCEL_PUSH frame on a stream other than the control stream MUST be treated as a connection -error of type HTTP_FRAME_UNEXPECTED. +error of type H3_FRAME_UNEXPECTED. ~~~~~~~~~~ drawing 0 1 2 3 @@ -1098,19 +1187,19 @@ error of type HTTP_FRAME_UNEXPECTED. | Push ID (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~~~~~~~~ -{: #fig-cancel-push title="CANCEL_PUSH frame payload"} +{: #fig-cancel-push title="CANCEL_PUSH Frame Payload"} The CANCEL_PUSH frame carries a Push ID encoded as a variable-length integer. The Push ID identifies the server push that is being cancelled (see {{frame-push-promise}}). If a CANCEL_PUSH frame is received which references a Push ID greater than currently allowed on the connection, this MUST be treated -as a connection error of type HTTP_ID_ERROR. +as a connection error of type H3_ID_ERROR. If the client receives a CANCEL_PUSH frame, that frame might identify a Push ID that has not yet been mentioned by a PUSH_PROMISE frame due to reordering. If a server receives a CANCEL_PUSH frame for a Push ID that has not yet been mentioned by a PUSH_PROMISE frame, this MUST be treated as a connection error of -type HTTP_ID_ERROR. +type H3_ID_ERROR. ### SETTINGS {#frame-settings} @@ -1125,11 +1214,11 @@ SETTINGS frames always apply to a connection, never a single stream. A SETTINGS frame MUST be sent as the first frame of each control stream (see {{control-streams}}) by each peer, and MUST NOT be sent subsequently. If an endpoint receives a second SETTINGS frame on the control stream, the endpoint -MUST respond with a connection error of type HTTP_FRAME_UNEXPECTED. +MUST respond with a connection error of type H3_FRAME_UNEXPECTED. SETTINGS frames MUST NOT be sent on any stream other than the control stream. If an endpoint receives a SETTINGS frame on a different stream, the endpoint -MUST respond with a connection error of type HTTP_FRAME_UNEXPECTED. 
+MUST respond with a connection error of type H3_FRAME_UNEXPECTED. SETTINGS parameters are not negotiated; they describe characteristics of the sending peer, which can be used by the receiving peer. However, a negotiation @@ -1144,7 +1233,7 @@ while servers are more cautious about request size. The same setting identifier MUST NOT occur more than once in the SETTINGS frame. A receiver MAY treat the presence of duplicate setting identifiers as a -connection error of type HTTP_SETTINGS_ERROR. +connection error of type H3_SETTINGS_ERROR. The payload of a SETTINGS frame consists of zero or more parameters. Each parameter consists of a setting identifier and a value, both encoded as QUIC @@ -1159,7 +1248,7 @@ variable-length integers. | Value (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~~~~~~~~~~~~~ -{: #fig-ext-settings title="SETTINGS parameter format"} +{: #fig-ext-settings title="SETTINGS Parameter Format"} An implementation MUST ignore the contents for any SETTINGS identifier it does not understand. @@ -1229,10 +1318,13 @@ A server MAY accept 0-RTT and subsequently provide different settings in its SETTINGS frame. If 0-RTT data is accepted by the server, its SETTINGS frame MUST NOT reduce any limits or alter any values that might be violated by the client with its 0-RTT data. The server MUST include all settings which differ from -their default values. If a server accepts 0-RTT, but then sends a SETTINGS -frame which reduces a setting the client understands or omits a value that was -previously specified to have a non-default value, this MUST be treated as a -connection error of type HTTP_SETTINGS_ERROR. +their default values. If a server accepts 0-RTT but then sends settings that +are not compatible with the previously specified settings, this MUST be treated +as a connection error of type H3_SETTINGS_ERROR. 
If a server accepts 0-RTT but +then sends a SETTINGS frame that omits a setting value that the client +understands (apart from reserved setting identifiers) that was previously +specified to have a non-default value, this MUST be treated as a connection +error of type H3_SETTINGS_ERROR. ### PUSH_PROMISE {#frame-push-promise} @@ -1249,14 +1341,14 @@ set from server to client on a request stream, as in HTTP/2. | Header Block (*) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~~~~~~~~ -{: #fig-push-promise title="PUSH_PROMISE frame payload"} +{: #fig-push-promise title="PUSH_PROMISE Frame Payload"} The payload consists of: Push ID: : A variable-length integer that identifies the server push operation. A Push ID is used in push stream headers ({{server-push}}), CANCEL_PUSH frames - ({{frame-cancel-push}}), and DUPLICATE_PUSH frames ({{frame-duplicate-push}}). + ({{frame-cancel-push}}). Header Block: : QPACK-compressed request header fields for the promised response. See [QPACK] @@ -1265,17 +1357,30 @@ Header Block: A server MUST NOT use a Push ID that is larger than the client has provided in a MAX_PUSH_ID frame ({{frame-max-push-id}}). A client MUST treat receipt of a PUSH_PROMISE frame that contains a larger Push ID than the client has advertised -as a connection error of HTTP_ID_ERROR. +as a connection error of H3_ID_ERROR. + +A server MAY use the same Push ID in multiple PUSH_PROMISE frames. If so, the +decompressed request header sets MUST contain the same fields in the same +order, and both the name and the value in each field MUST be exact +matches. Clients SHOULD compare the request header sets for resources promised +multiple times. If a client receives a Push ID that has already been promised +and detects a mismatch, it MUST respond with a connection error of type +H3_GENERAL_PROTOCOL_ERROR.
If the decompressed header sets match exactly, the +client SHOULD associate the pushed content with each stream on which +a PUSH_PROMISE was received. -A server MUST NOT use the same Push ID in multiple PUSH_PROMISE frames. A client -MUST treat receipt of a Push ID which has already been promised as a connection -error of type HTTP_ID_ERROR. +Allowing duplicate references to the same Push ID is primarily to reduce +duplication caused by concurrent requests. A server SHOULD avoid reusing a Push +ID over a long period. Clients are likely to consume server push responses and +not retain them for reuse over time. Clients that see a PUSH_PROMISE that uses +a Push ID that they have already consumed and discarded are forced to ignore the +PUSH_PROMISE. If a PUSH_PROMISE frame is received on the control stream, the client MUST -respond with a connection error ({{errors}}) of type HTTP_FRAME_UNEXPECTED. +respond with a connection error ({{errors}}) of type H3_FRAME_UNEXPECTED. A client MUST NOT send a PUSH_PROMISE frame. A server MUST treat the receipt -of a PUSH_PROMISE frame as a connection error of type HTTP_FRAME_UNEXPECTED. +of a PUSH_PROMISE frame as a connection error of type H3_FRAME_UNEXPECTED. See {{server-push}} for a description of the overall server push mechanism. @@ -1294,20 +1399,20 @@ close a connection. | Stream ID (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~~~~~~~~ -{: #fig-goaway title="GOAWAY frame payload"} +{: #fig-goaway title="GOAWAY Frame Payload"} The GOAWAY frame is always sent on the control stream. It carries a QUIC Stream ID for a client-initiated bidirectional stream encoded as a variable-length integer. A client MUST treat receipt of a GOAWAY frame containing a Stream ID -of any other type as a connection error of type HTTP_ID_ERROR. +of any other type as a connection error of type H3_ID_ERROR. Clients do not need to send GOAWAY to initiate a graceful shutdown; they simply stop making new requests. 
A server MUST treat receipt of a GOAWAY frame on any -stream as a connection error ({{errors}}) of type HTTP_FRAME_UNEXPECTED. +stream as a connection error ({{errors}}) of type H3_FRAME_UNEXPECTED. The GOAWAY frame applies to the connection, not a specific stream. A client MUST treat a GOAWAY frame on a stream other than the control stream as a -connection error ({{errors}}) of type HTTP_FRAME_UNEXPECTED. +connection error ({{errors}}) of type H3_FRAME_UNEXPECTED. See {{connection-shutdown}} for more information on the use of the GOAWAY frame. @@ -1321,10 +1426,10 @@ initiate in addition to the limit maintained by the QUIC transport. The MAX_PUSH_ID frame is always sent on the control stream. Receipt of a MAX_PUSH_ID frame on any other stream MUST be treated as a connection error of -type HTTP_FRAME_UNEXPECTED. +type H3_FRAME_UNEXPECTED. A server MUST NOT send a MAX_PUSH_ID frame. A client MUST treat the receipt of -a MAX_PUSH_ID frame as a connection error of type HTTP_FRAME_UNEXPECTED. +a MAX_PUSH_ID frame as a connection error of type H3_FRAME_UNEXPECTED. The maximum Push ID is unset when a connection is created, meaning that a server cannot push until it receives a MAX_PUSH_ID frame. A client that wishes to @@ -1338,55 +1443,13 @@ sending MAX_PUSH_ID frames as the server fulfills or cancels server pushes. | Push ID (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~~~~~~~~ -{: #fig-max-push title="MAX_PUSH_ID frame payload"} +{: #fig-max-push title="MAX_PUSH_ID Frame Payload"} The MAX_PUSH_ID frame carries a single variable-length integer that identifies the maximum value for a Push ID that the server can use (see {{frame-push-promise}}). A MAX_PUSH_ID frame cannot reduce the maximum Push ID; receipt of a MAX_PUSH_ID that contains a smaller value than previously received -MUST be treated as a connection error of type HTTP_ID_ERROR. 
- -### DUPLICATE_PUSH {#frame-duplicate-push} - -The DUPLICATE_PUSH frame (type=0xE) is used by servers to indicate that an -existing pushed resource is related to multiple client requests. - -The DUPLICATE_PUSH frame is always sent on a request stream. Receipt of a -DUPLICATE_PUSH frame on any other stream MUST be treated as a connection error -of type HTTP_FRAME_UNEXPECTED. - -A client MUST NOT send a DUPLICATE_PUSH frame. A server MUST treat the receipt -of a DUPLICATE_PUSH frame as a connection error of type HTTP_FRAME_UNEXPECTED. - -~~~~~~~~~~ drawing - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| Push ID (i) ... -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -~~~~~~~~~~ -{: #fig-duplicate-push title="DUPLICATE_PUSH frame payload"} - -The DUPLICATE_PUSH frame carries a single variable-length integer that -identifies the Push ID of a resource that the server has previously promised -(see {{frame-push-promise}}), though that promise might not be received before -this frame. A server MUST NOT use a Push ID that is larger than the client has -provided in a MAX_PUSH_ID frame ({{frame-max-push-id}}). A client MUST treat -receipt of a DUPLICATE_PUSH that contains a larger Push ID than the client has -advertised as a connection error of type HTTP_ID_ERROR. - -This frame allows the server to use the same server push in response to multiple -concurrent requests. Referencing the same server push ensures that a promise -can be made in relation to every response in which server push might be needed -without duplicating request headers or pushed responses. - -Allowing duplicate references to the same Push ID is primarily to reduce -duplication caused by concurrent requests. A server SHOULD avoid reusing a Push -ID over a long period. Clients are likely to consume server push responses and -not retain them for reuse over time. 
Clients that see a DUPLICATE_PUSH that -uses a Push ID that they have since consumed and discarded are forced to ignore -the DUPLICATE_PUSH. - +MUST be treated as a connection error of type H3_ID_ERROR. ### Reserved Frame Types {#frame-reserved} @@ -1402,7 +1465,7 @@ implementation chooses. Frame types which were used in HTTP/2 where there is no corresponding HTTP/3 frame have also been reserved ({{iana-frames}}). These frame types MUST NOT be -sent, and receipt MAY be treated as an error of type HTTP_FRAME_UNEXPECTED. +sent, and receipt MAY be treated as an error of type H3_FRAME_UNEXPECTED. # Error Handling {#errors} @@ -1410,13 +1473,16 @@ sent, and receipt MAY be treated as an error of type HTTP_FRAME_UNEXPECTED. QUIC allows the application to abruptly terminate (reset) individual streams or the entire connection when an error is encountered. These are referred to as "stream errors" or "connection errors" and are described in more detail in -{{QUIC-TRANSPORT}}. An endpoint MAY choose to treat a stream error as a -connection error. +{{QUIC-TRANSPORT}}. + +An endpoint MAY choose to treat a stream error as a connection error under +certain circumstances. Implementations need to consider the impact on +outstanding requests before making this choice. Because new error codes can be defined without negotiation (see {{extensions}}), -receipt of an unknown error code or use of an error code in an unexpected -context MUST NOT be treated as an error. However, closing a stream can -constitute an error regardless of the error code (see {{request-response}}). +use of an error code in an unexpected context or receipt of an unknown error +code MUST be treated as equivalent to H3_NO_ERROR. However, closing a stream +can have other effects regardless of the error code (see {{request-response}}). This section describes HTTP/3-specific error codes which can be used to express the cause of a connection or stream error. 
@@ -1426,66 +1492,67 @@ the cause of a connection or stream error. The following error codes are defined for use when abruptly terminating streams, aborting reading of streams, or immediately closing connections. -HTTP_NO_ERROR (0x100): +H3_NO_ERROR (0x100): : No error. This is used when the connection or stream needs to be closed, but there is no error to signal. -HTTP_GENERAL_PROTOCOL_ERROR (0x101): +H3_GENERAL_PROTOCOL_ERROR (0x101): : Peer violated protocol requirements in a way which doesn't match a more specific error code, or endpoint declines to use the more specific error code. -HTTP_INTERNAL_ERROR (0x102): +H3_INTERNAL_ERROR (0x102): : An internal error has occurred in the HTTP stack. -HTTP_STREAM_CREATION_ERROR (0x103): +H3_STREAM_CREATION_ERROR (0x103): : The endpoint detected that its peer created a stream that it will not accept. -HTTP_CLOSED_CRITICAL_STREAM (0x104): +H3_CLOSED_CRITICAL_STREAM (0x104): : A stream required by the connection was closed or reset. -HTTP_FRAME_UNEXPECTED (0x105): +H3_FRAME_UNEXPECTED (0x105): : A frame was received which was not permitted in the current state or on the current stream. -HTTP_FRAME_ERROR (0x106): +H3_FRAME_ERROR (0x106): : A frame that fails to satisfy layout requirements or with an invalid size was received. -HTTP_EXCESSIVE_LOAD (0x107): +H3_EXCESSIVE_LOAD (0x107): : The endpoint detected that its peer is exhibiting a behavior that might be generating excessive load. -HTTP_ID_ERROR (0x108): +H3_ID_ERROR (0x108): : A Stream ID or Push ID was used incorrectly, such as exceeding a limit, reducing a limit, or being reused. -HTTP_SETTINGS_ERROR (0x109): +H3_SETTINGS_ERROR (0x109): : An endpoint detected an error in the payload of a SETTINGS frame. -HTTP_MISSING_SETTINGS (0x10A): +H3_MISSING_SETTINGS (0x10A): : No SETTINGS frame was received at the beginning of the control stream. 
-HTTP_REQUEST_REJECTED (0x10B): +H3_REQUEST_REJECTED (0x10B): : A server rejected a request without performing any application processing. -HTTP_REQUEST_CANCELLED (0x10C): +H3_REQUEST_CANCELLED (0x10C): : The request or its response (including pushed response) is cancelled. -HTTP_REQUEST_INCOMPLETE (0x10D): +H3_REQUEST_INCOMPLETE (0x10D): : The client's stream terminated without containing a fully-formed request. -HTTP_EARLY_RESPONSE (0x10E): -: The remainder of the client's request is not needed to produce a response. - For use in STOP_SENDING only. - -HTTP_CONNECT_ERROR (0x10F): +H3_CONNECT_ERROR (0x10F): : The connection established in response to a CONNECT request was reset or abnormally closed. -HTTP_VERSION_FALLBACK (0x110): +H3_VERSION_FALLBACK (0x110): : The requested operation cannot be served over HTTP/3. The peer should retry over HTTP/1.1. +Error codes of the format `0x1f * N + 0x21` for integer values of N are reserved +to exercise the requirement that unknown error codes be treated as equivalent to +H3_NO_ERROR ({{extensions}}). Implementations SHOULD select an error code from +this space with some probability when they would have sent H3_NO_ERROR. + # Extensions to HTTP/3 {#extensions} HTTP/3 permits extension of the protocol. Within the limitations described in @@ -1516,8 +1583,9 @@ requirement and SHOULD be treated as an error. Extensions that could change the semantics of existing protocol components MUST be negotiated before being used. For example, an extension that changes the layout of the HEADERS frame cannot be used until the peer has given a positive -signal that this is acceptable. In this case, it could also be necessary to -coordinate when the revised layout comes into effect. +signal that this is acceptable. Coordinating when such a revised layout comes +into effect could prove complex. As such, allocating new identifiers for +new definitions of existing protocol elements is likely to be more effective. 
This document doesn't mandate a specific method for negotiating the use of an extension but notes that a setting ({{settings-parameters}}) could be used for @@ -1542,8 +1610,10 @@ Where HTTP/2 employs PADDING frames and Padding fields in other frames to make a connection more resistant to traffic analysis, HTTP/3 can either rely on transport-layer padding or employ the reserved frame and stream types discussed in {{frame-reserved}} and {{stream-grease}}. These methods of padding produce -different results in terms of the granularity of padding, the effect of packet -loss and recovery, and how an implementation might control padding. +different results in terms of the granularity of padding, how padding is +arranged in relation to the information that is being protected, whether +padding is applied in the case of packet loss, and how an implementation might +control padding. ## Frame Parsing @@ -1571,7 +1641,10 @@ accept that the original address might change. # IANA Considerations -## Registration of HTTP/3 Identification String +This document registers a new ALPN protocol ID ({{iana-alpn}}) and creates new +registries that manage the assignment of codepoints in HTTP/3. + +## Registration of HTTP/3 Identification String {#iana-alpn} This document creates a new registration for the identification of HTTP/3 in the "Application Layer Protocol Negotiation (ALPN) @@ -1588,26 +1661,24 @@ The "h3" string identifies HTTP/3: Specification: : This document -## Registration of QUIC Version Hint Alt-Svc Parameter - -This document creates a new registration for version-negotiation hints in the -"Hypertext Transfer Protocol (HTTP) Alt-Svc Parameter" registry established in -{{!RFC7838}}. +## New Registries {#iana-policy} - Parameter: - : "quic" +New registries created in this document operate under the QUIC registration +policy documented in Section 22.1 of {{QUIC-TRANSPORT}}. These registries all +include the common set of fields listed in Section 22.1.1 of {{QUIC-TRANSPORT}}. 
- Specification: - : This document, {{alt-svc-version-hint}} +The initial allocations in these registries created in this document are all +assigned permanent status and list as contact both the IESG (iesg@ietf.org) and +the HTTP working group (ietf-http-wg@w3.org). -## Frame Types {#iana-frames} +### Frame Types {#iana-frames} This document establishes a registry for HTTP/3 frame type codes. The "HTTP/3 -Frame Type" registry governs a 62-bit space. This space is split into three -spaces that are governed by different policies. Values between `0x00` and `0x3f` -(in hexadecimal) are assigned via the Standards Action or IESG Review policies -{{!RFC8126}}. Values from `0x40` to `0x3fff` operate on the Specification -Required policy {{!RFC8126}}. All other values are assigned to Private Use +Frame Type" registry governs a 62-bit space. This registry follows the QUIC +registry policy; see {{iana-policy}}. Permanent registrations in this registry +are assigned using the Specification Required policy {{!RFC8126}}, except for +values between 0x00 and 0x3f (in hexadecimal; inclusive), which are assigned +using Standards Action or IESG Approval as defined in Section 4.9 and 4.10 of {{!RFC8126}}. While this registry is separate from the "HTTP/2 Frame Type" registry defined in @@ -1616,23 +1687,19 @@ code spaces overlap. If an entry is present in only one registry, every effort SHOULD be made to avoid assigning the corresponding value to an unrelated operation. -New entries in this registry require the following information: +In addition to common fields as described in {{iana-policy}}, permanent +registrations in this registry MUST include the following field: Frame Type: : A name or label for the frame type. -Code: -: The 62-bit code assigned to the frame type. - -Specification: -: A reference to a specification that includes a description of the frame layout - and its semantics, including any parts of the frame that are conditionally - present. 
+Specifications of frame types MUST include a description of the frame layout and +its semantics, including any parts of the frame that are conditionally present. -The entries in the following table are registered by this document. +The entries in {{iana-frame-table}} are registered by this document. | ---------------- | ------ | -------------------------- | -| Frame Type | Code | Specification | +| Frame Type | Value | Specification | | ---------------- | :----: | -------------------------- | | DATA | 0x0 | {{frame-data}} | | HEADERS | 0x1 | {{frame-headers}} | @@ -1645,49 +1712,42 @@ The entries in the following table are registered by this document. | Reserved | 0x8 | N/A | | Reserved | 0x9 | N/A | | MAX_PUSH_ID | 0xD | {{frame-max-push-id}} | -| DUPLICATE_PUSH | 0xE | {{frame-duplicate-push}} | | ---------------- | ------ | -------------------------- | +{: #iana-frame-table title="Initial HTTP/3 Frame Types"} Additionally, each code of the format `0x1f * N + 0x21` for integer values of N (that is, `0x21`, `0x40`, ..., through `0x3FFFFFFFFFFFFFFE`) MUST NOT be assigned by IANA. -## Settings Parameters {#iana-settings} +### Settings Parameters {#iana-settings} This document establishes a registry for HTTP/3 settings. The "HTTP/3 Settings" -registry governs a 62-bit space. This space is split into three spaces that are -governed by different policies. Values between `0x00` and `0x3f` (in -hexadecimal) are assigned via the Standards Action or IESG Review policies -{{!RFC8126}}. Values from `0x40` to `0x3fff` operate on the Specification -Required policy {{!RFC8126}}. All other values are assigned to Private Use -{{!RFC8126}}. The designated experts are the same as those for the "HTTP/2 -Settings" registry defined in {{!HTTP2}}. +registry governs a 62-bit space. This registry follows the QUIC registry +policy; see {{iana-policy}}. 
Permanent registrations in this registry are +assigned using the Specification Required policy {{!RFC8126}}, except for values +between 0x00 and 0x3f (in hexadecimal; inclusive), which are assigned using +Standards Action or IESG Approval as defined in Section 4.9 and 4.10 of +{{!RFC8126}}. While this registry is separate from the "HTTP/2 Settings" registry defined in {{!HTTP2}}, it is preferable that the assignments parallel each other. If an entry is present in only one registry, every effort SHOULD be made to avoid assigning the corresponding value to an unrelated operation. -New registrations are advised to provide the following information: +In addition to common fields as described in {{iana-policy}}, permanent +registrations in this registry MUST include the following fields: -Name: +Setting Name: : A symbolic name for the setting. Specifying a setting name is optional. -Code: -: The 62-bit code assigned to the setting. - -Specification: -: An optional reference to a specification that describes the use of the - setting. - Default: -: The value of the setting unless otherwise indicated. SHOULD be the most - restrictive possible value. +: The value of the setting unless otherwise indicated. A default SHOULD be the + most restrictive possible value. -The entries in the following table are registered by this document. +The entries in {{iana-setting-table}} are registered by this document. | ---------------------------- | ------ | ------------------------- | --------- | -| Setting Name | Code | Specification | Default | +| Setting Name | Value | Specification | Default | | ---------------------------- | :----: | ------------------------- | --------- | | Reserved | 0x2 | N/A | N/A | | Reserved | 0x3 | N/A | N/A | @@ -1695,91 +1755,91 @@ The entries in the following table are registered by this document. 
| Reserved | 0x5 | N/A | N/A | | MAX_HEADER_LIST_SIZE | 0x6 | {{settings-parameters}} | Unlimited | | ---------------------------- | ------ | ------------------------- | --------- | +{: #iana-setting-table title="Initial HTTP/3 Settings"} Additionally, each code of the format `0x1f * N + 0x21` for integer values of N (that is, `0x21`, `0x40`, ..., through `0x3FFFFFFFFFFFFFFE`) MUST NOT be assigned by IANA. -## Error Codes {#iana-error-codes} +### Error Codes {#iana-error-codes} This document establishes a registry for HTTP/3 error codes. The "HTTP/3 Error -Code" registry manages a 62-bit space. The "HTTP/3 Error Code" registry -operates under the "Expert Review" policy {{?RFC8126}}. +Code" registry manages a 62-bit space. This registry follows the QUIC registry +policy; see {{iana-policy}}. Permanent registrations in this registry are +assigned using the Specification Required policy {{!RFC8126}}, except for values +between 0x00 and 0x3f (in hexadecimal; inclusive), which are assigned using +Standards Action or IESG Approval as defined in Section 4.9 and 4.10 of +{{!RFC8126}}. Registrations for error codes are required to include a description of the error code. An expert reviewer is advised to examine new registrations for possible duplication with existing error codes. Use of existing registrations is to be encouraged, but not mandated. -New registrations are advised to provide the following information: +In addition to common fields as described in {{iana-policy}}, permanent +registrations in this registry MUST include the following fields: Name: : A name for the error code. Specifying an error code name is optional. -Code: -: The 62-bit error code value. - Description: -: A brief description of the error code semantics, longer if no detailed - specification is provided. +: A brief description of the error code semantics. + +The entries in {{iana-error-table}} are registered by this document. 
+ +| --------------------------------- | ---------- | ---------------------------------------- | ---------------------- | +| Name | Value | Description | Specification | +| --------------------------------- | ---------- | ---------------------------------------- | ---------------------- | +| H3_NO_ERROR | 0x0100 | No error | {{http-error-codes}} | +| H3_GENERAL_PROTOCOL_ERROR | 0x0101 | General protocol error | {{http-error-codes}} | +| H3_INTERNAL_ERROR | 0x0102 | Internal error | {{http-error-codes}} | +| H3_STREAM_CREATION_ERROR | 0x0103 | Stream creation error | {{http-error-codes}} | +| H3_CLOSED_CRITICAL_STREAM | 0x0104 | Critical stream was closed | {{http-error-codes}} | +| H3_FRAME_UNEXPECTED | 0x0105 | Frame not permitted in the current state | {{http-error-codes}} | +| H3_FRAME_ERROR | 0x0106 | Frame violated layout or size rules | {{http-error-codes}} | +| H3_EXCESSIVE_LOAD | 0x0107 | Peer generating excessive load | {{http-error-codes}} | +| H3_ID_ERROR | 0x0108 | An identifier was used incorrectly | {{http-error-codes}} | +| H3_SETTINGS_ERROR | 0x0109 | SETTINGS frame contained invalid values | {{http-error-codes}} | +| H3_MISSING_SETTINGS | 0x010A | No SETTINGS frame received | {{http-error-codes}} | +| H3_REQUEST_REJECTED | 0x010B | Request not processed | {{http-error-codes}} | +| H3_REQUEST_CANCELLED | 0x010C | Data no longer needed | {{http-error-codes}} | +| H3_REQUEST_INCOMPLETE | 0x010D | Stream terminated early | {{http-error-codes}} | +| H3_CONNECT_ERROR | 0x010F | TCP reset or error on CONNECT request | {{http-error-codes}} | +| H3_VERSION_FALLBACK | 0x0110 | Retry over HTTP/1.1 | {{http-error-codes}} | +| --------------------------------- | ---------- | ---------------------------------------- | ---------------------- | +{: #iana-error-table title="Initial HTTP/3 Error Codes"} -Specification: -: An optional reference for a specification that defines the error code. - -The entries in the following table are registered by this document. 
+Additionally, each code of the format `0x1f * N + 0x21` for integer values of N +(that is, `0x21`, `0x40`, ..., through `0x3FFFFFFFFFFFFFFE`) MUST NOT be +assigned by IANA. -| ----------------------------------- | ---------- | ---------------------------------------- | ---------------------- | -| Name | Code | Description | Specification | -| ----------------------------------- | ---------- | ---------------------------------------- | ---------------------- | -| HTTP_NO_ERROR | 0x0100 | No error | {{http-error-codes}} | -| HTTP_GENERAL_PROTOCOL_ERROR | 0x0101 | General protocol error | {{http-error-codes}} | -| HTTP_INTERNAL_ERROR | 0x0102 | Internal error | {{http-error-codes}} | -| HTTP_STREAM_CREATION_ERROR | 0x0103 | Stream creation error | {{http-error-codes}} | -| HTTP_CLOSED_CRITICAL_STREAM | 0x0104 | Critical stream was closed | {{http-error-codes}} | -| HTTP_FRAME_UNEXPECTED | 0x0105 | Frame not permitted in the current state | {{http-error-codes}} | -| HTTP_FRAME_ERROR | 0x0106 | Frame violated layout or size rules | {{http-error-codes}} | -| HTTP_EXCESSIVE_LOAD | 0x0107 | Peer generating excessive load | {{http-error-codes}} | -| HTTP_ID_ERROR | 0x0108 | An identifier was used incorrectly | {{http-error-codes}} | -| HTTP_SETTINGS_ERROR | 0x0109 | SETTINGS frame contained invalid values | {{http-error-codes}} | -| HTTP_MISSING_SETTINGS | 0x010A | No SETTINGS frame received | {{http-error-codes}} | -| HTTP_REQUEST_REJECTED | 0x010B | Request not processed | {{http-error-codes}} | -| HTTP_REQUEST_CANCELLED | 0x010C | Data no longer needed | {{http-error-codes}} | -| HTTP_REQUEST_INCOMPLETE | 0x010D | Stream terminated early | {{http-error-codes}} | -| HTTP_EARLY_RESPONSE | 0x010E | Remainder of request not needed | {{http-error-codes}} | -| HTTP_CONNECT_ERROR | 0x010F | TCP reset or error on CONNECT request | {{http-error-codes}} | -| HTTP_VERSION_FALLBACK | 0x0110 | Retry over HTTP/1.1 | {{http-error-codes}} | -| ----------------------------------- | 
---------- | ---------------------------------------- | ---------------------- | - -## Stream Types {#iana-stream-types} +### Stream Types {#iana-stream-types} This document establishes a registry for HTTP/3 unidirectional stream types. The -"HTTP/3 Stream Type" registry governs a 62-bit space. This space is split into -three spaces that are governed by different policies. Values between `0x00` and -0x3f (in hexadecimal) are assigned via the Standards Action or IESG Review -policies {{!RFC8126}}. Values from `0x40` to `0x3fff` operate on the -Specification Required policy {{!RFC8126}}. All other values are assigned to -Private Use {{!RFC8126}}. +"HTTP/3 Stream Type" registry governs a 62-bit space. This registry follows the +QUIC registry policy; see {{iana-policy}}. Permanent registrations in this +registry are assigned using the Specification Required policy {{!RFC8126}}, +except for values between 0x00 and 0x3f (in hexadecimal; inclusive), which are +assigned using Standards Action or IESG Approval as defined in Section 4.9 and +4.10 of {{!RFC8126}}. -New entries in this registry require the following information: +In addition to common fields as described in {{iana-policy}}, permanent +registrations in this registry MUST include the following fields: Stream Type: : A name or label for the stream type. -Code: -: The 62-bit code assigned to the stream type. - -Specification: -: A reference to a specification that includes a description of the stream type, - including the layout semantics of its payload. - Sender: : Which endpoint on a connection may initiate a stream of this type. Values are "Client", "Server", or "Both". +Specifications for permanent registrations MUST include a description of the +stream type, including the layout semantics of the stream contents. + The entries in the following table are registered by this document. 
| ---------------- | ------ | -------------------------- | ------ | -| Stream Type | Code | Specification | Sender | +| Stream Type | Value | Specification | Sender | | ---------------- | :----: | -------------------------- | ------ | | Control Stream | 0x00 | {{control-streams}} | Both | | Push Stream | 0x01 | {{server-push}} | Server | @@ -1814,6 +1874,21 @@ considerations about exhaustion of stream identifier space apply, though the space is significantly larger such that it is likely that other limits in QUIC are reached first, such as the limit on the connection flow control window. +In contrast to HTTP/2, stream concurrency in HTTP/3 is managed by QUIC. QUIC +considers a stream closed when all data has been received and sent data has been +acknowledged by the peer. HTTP/2 considers a stream closed when the frame +containing the END_STREAM bit has been committed to the transport. As a result, +the stream for an equivalent exchange could remain "active" for a longer period +of time. HTTP/3 servers might choose to permit a larger number of concurrent +client-initiated bidirectional streams to achieve equivalent concurrency to +HTTP/2, depending on the expected usage patterns. + +Due to the presence of other unidirectional stream types, HTTP/3 does not rely +exclusively on the number of concurrent unidirectional streams to control the +number of concurrent in-flight pushes. Instead, HTTP/3 clients use the +MAX_PUSH_ID frame to control the number of pushes received from an HTTP/3 +server. + ## HTTP Frame Types {#h2-frames} Many framing concepts from HTTP/2 can be elided on QUIC, because the transport @@ -1969,7 +2044,8 @@ In HTTP/3, setting values are variable-length integers (6, 14, 30, or 62 bits long) rather than fixed-length 32-bit fields as in HTTP/2. This will often produce a shorter encoding, but can produce a longer encoding for settings which use the full 32-bit space. 
Settings ported from HTTP/2 might choose to redefine -the format of their settings to avoid using the 62-bit encoding. +their value to limit it to 30 bits for more efficient encoding, or to make use +of the 62-bit space if more than 30 bits are required. Settings need to be defined separately for HTTP/2 and HTTP/3. The IDs of settings defined in {{!HTTP2}} have been reserved for simplicity. Note that @@ -1993,15 +2069,15 @@ The HTTP/2 error codes defined in Section 7 of {{!HTTP2}} logically map to the HTTP/3 error codes as follows: NO_ERROR (0x0): -: HTTP_NO_ERROR in {{http-error-codes}}. +: H3_NO_ERROR in {{http-error-codes}}. PROTOCOL_ERROR (0x1): -: This is mapped to HTTP_GENERAL_PROTOCOL_ERROR except in cases where more - specific error codes have been defined. This includes HTTP_FRAME_UNEXPECTED - and HTTP_CLOSED_CRITICAL_STREAM defined in {{http-error-codes}}. +: This is mapped to H3_GENERAL_PROTOCOL_ERROR except in cases where more + specific error codes have been defined. This includes H3_FRAME_UNEXPECTED + and H3_CLOSED_CRITICAL_STREAM defined in {{http-error-codes}}. INTERNAL_ERROR (0x2): -: HTTP_INTERNAL_ERROR in {{http-error-codes}}. +: H3_INTERNAL_ERROR in {{http-error-codes}}. FLOW_CONTROL_ERROR (0x3): : Not applicable, since QUIC handles flow control. @@ -2013,31 +2089,31 @@ STREAM_CLOSED (0x5): : Not applicable, since QUIC handles stream management. FRAME_SIZE_ERROR (0x6): -: HTTP_FRAME_ERROR error code defined in {{http-error-codes}}. +: H3_FRAME_ERROR error code defined in {{http-error-codes}}. REFUSED_STREAM (0x7): -: HTTP_REQUEST_REJECTED (in {{http-error-codes}}) is used to indicate that a +: H3_REQUEST_REJECTED (in {{http-error-codes}}) is used to indicate that a request was not processed. Otherwise, not applicable because QUIC handles stream management. CANCEL (0x8): -: HTTP_REQUEST_CANCELLED in {{http-error-codes}}. +: H3_REQUEST_CANCELLED in {{http-error-codes}}. COMPRESSION_ERROR (0x9): : Multiple error codes are defined in [QPACK]. 
CONNECT_ERROR (0xa): -: HTTP_CONNECT_ERROR in {{http-error-codes}}. +: H3_CONNECT_ERROR in {{http-error-codes}}. ENHANCE_YOUR_CALM (0xb): -: HTTP_EXCESSIVE_LOAD in {{http-error-codes}}. +: H3_EXCESSIVE_LOAD in {{http-error-codes}}. INADEQUATE_SECURITY (0xc): : Not applicable, since QUIC is assumed to provide sufficient security on all connections. -HTTP_1_1_REQUIRED (0xd): -: HTTP_VERSION_FALLBACK in {{http-error-codes}}. +HTTP_1_1_REQUIRED (0xd): +: H3_VERSION_FALLBACK in {{http-error-codes}}. Error codes need to be defined for HTTP/2 and HTTP/3 separately. See {{iana-error-codes}}. @@ -2047,6 +2123,25 @@ Error codes need to be defined for HTTP/2 and HTTP/3 separately. See > **RFC Editor's Note:** Please remove this section prior to publication of a > final version of this document. +## Since draft-ietf-quic-http-25 + +- Require QUICv1 for HTTP/3 (#3117, #3323) +- Remove DUPLICATE_PUSH and allow duplicate PUSH_PROMISE (#3275, #3309) +- Clarify the definition of "malformed" (#3352, #3345) + +## Since draft-ietf-quic-http-24 + +- Removed H3_EARLY_RESPONSE error code; H3_NO_ERROR is recommended instead + (#3130,#3208) +- Unknown error codes are equivalent to H3_NO_ERROR (#3276,#3331) +- Some error codes are reserved for greasing (#3325,#3360) + +## Since draft-ietf-quic-http-23 + +- Removed `quic` Alt-Svc parameter (#3061,#3118) +- Clients need not persist unknown settings for use in 0-RTT (#3110,#3113) +- Clarify error cases around CANCEL_PUSH (#2819,#3083) + ## Since draft-ietf-quic-http-22 - Removed priority signaling (#2922,#2924) @@ -2075,7 +2170,7 @@ Error codes need to be defined for HTTP/2 and HTTP/3 separately. 
See ## Since draft-ietf-quic-http-21 -- No changes +No changes ## Since draft-ietf-quic-http-20 diff --git a/draft-ietf-quic-qpack.md b/draft-ietf-quic-qpack.md index d861e753e6..1e0fb7e724 100644 --- a/draft-ietf-quic-qpack.md +++ b/draft-ietf-quic-qpack.md @@ -84,7 +84,7 @@ code and issues list for this draft can be found at The QUIC transport protocol {{QUIC-TRANSPORT}} is designed to support HTTP semantics, and its design subsumes many of the features of HTTP/2 {{?RFC7540}}. -HTTP/2 uses HPACK ({{!RFC7541}}) for header compression. If HPACK were used for +HTTP/2 uses HPACK {{!RFC7541}} for header compression. If HPACK were used for HTTP/3 {{HTTP3}}, it would induce head-of-line blocking due to built-in assumptions of a total ordering across frames on all streams. @@ -158,10 +158,10 @@ x ... # Compression Process Overview Like HPACK, QPACK uses two tables for associating header fields to indices. The -static table (see {{table-static}}) is predefined and contains common header -fields (some of them with an empty value). The dynamic table (see -{{table-dynamic}}) is built up over the course of the connection and can be used -by the encoder to index header fields in the encoded header lists. +static table ({{header-table-static}}) is predefined and contains common header +fields (some of them with an empty value). The dynamic table +({{header-table-dynamic}}) is built up over the course of the connection and can +be used by the encoder to index header fields in the encoded header lists. QPACK defines unidirectional streams for sending instructions from encoder to decoder and vice versa. @@ -169,8 +169,8 @@ decoder and vice versa. ## Encoder An encoder converts a header list into a header block by emitting either an -indexed or a literal representation for each header field in the list (see -{{header-block-representations}}). 
Indexed representations achieve high +indexed or a literal representation for each header field in the list; see +{{header-block-representations}}. Indexed representations achieve high compression by replacing the literal name and possibly the value with an index to either the static or dynamic table. References to the static table and literal representations do not require any dynamic state and never risk @@ -192,45 +192,45 @@ while the decoder is relatively simple. An encoder MUST ensure that a header block which references a dynamic table entry is not processed by the decoder after the referenced entry has been -evicted. Hence the encoder needs to track information about each compressed +evicted. Hence the encoder needs to retain information about each compressed header block that references the dynamic table until that header block is -acknowledged by the decoder (see {{header-acknowledgement}}). +acknowledged by the decoder; see {{header-acknowledgement}}. -### Blocked Dynamic Table Insertions {#blocked-insertion} +### Limits on Dynamic Table Insertions {#blocked-insertion} -A dynamic table entry is considered blocking and cannot be evicted until its -insertion has been acknowledged and there are no outstanding unacknowledged -references to the entry. In particular, a dynamic table entry that has never -been referenced can still be blocking. +Inserting entries into the dynamic table might not be possible if the table +contains entries which cannot be evicted. -Note: -: A blocking entry is unrelated to a blocked stream, which is a stream that a - decoder cannot decode as a result of references to entries that are not yet - available. An encoder that uses the dynamic table has to keep track of - blocked entries. +A dynamic table entry cannot be evicted immediately after insertion, even if it +has never been referenced. 
Once the insertion of a dynamic table entry has been +acknowledged and there are no outstanding unacknowledged references to the +entry, the entry becomes evictable. -An encoder MUST NOT insert an entry into the dynamic table (or duplicate an -existing entry) if doing so would evict a blocking entry. +If the dynamic table does not contain enough room for a new entry without +evicting other entries, and the entries which would be evicted are not +evictable, the encoder MUST NOT insert that entry into the dynamic table +(including duplicates of existing entries). In order to avoid this, an encoder +that uses the dynamic table has to keep track of whether each entry is currently +evictable or not. - -#### Avoiding Blocked Insertions +#### Avoiding Prohibited Insertions To ensure that the encoder is not prevented from adding new entries, the encoder can avoid referencing entries that are close to eviction. Rather than -reference such an entry, the encoder can emit a Duplicate instruction (see -{{duplicate}}), and reference the duplicate instead. +reference such an entry, the encoder can emit a Duplicate instruction +({{duplicate}}), and reference the duplicate instead. Determining which entries are too close to eviction to reference is an encoder preference. One heuristic is to target a fixed amount of available space in the dynamic table: either unused space or space that can be reclaimed by evicting non-blocking entries. To achieve this, the encoder can maintain a draining -index, which is the smallest absolute index (see {{indexing}}) in the dynamic -table that it will emit a reference for. As new entries are inserted, the -encoder increases the draining index to maintain the section of the table that -it will not reference. If the encoder does not create new references to entries -with an absolute index lower than the draining index, the number of -unacknowledged references to those entries will eventually become zero, allowing -them to be evicted. 
+index, which is the smallest absolute index ({{indexing}}) in the dynamic table +that it will emit a reference for. As new entries are inserted, the encoder +increases the draining index to maintain the section of the table that it will +not reference. If the encoder does not create new references to entries with an +absolute index lower than the draining index, the number of unacknowledged +references to those entries will eventually become zero, allowing them to be +evicted. ~~~~~~~~~~ drawing +----------+---------------------------------+--------+ @@ -251,7 +251,7 @@ Because QUIC does not guarantee order between data on different streams, a decoder might encounter a header block that references a dynamic table entry that it has not yet received. -Each header block contains a Required Insert Count (see {{header-prefix}}), the +Each header block contains a Required Insert Count ({{header-prefix}}), the lowest possible value for the Insert Count with which the header block can be decoded. For a header block with references to the dynamic table, the Required Insert Count is one larger than the largest absolute index of all referenced @@ -260,14 +260,14 @@ table, the Required Insert Count is zero. When the decoder receives a header block with a Required Insert Count greater than its own Insert Count, the stream cannot be processed immediately, and is -considered "blocked" (see {{blocked-decoding}}). +considered "blocked"; see {{blocked-decoding}}. The decoder specifies an upper bound on the number of streams which can be -blocked using the SETTINGS_QPACK_BLOCKED_STREAMS setting (see -{{configuration}}). An encoder MUST limit the number of streams which could -become blocked to the value of SETTINGS_QPACK_BLOCKED_STREAMS at all times. -If an decoder encounters more blocked streams than it promised to support, it -MUST treat this as a connection error of type HTTP_QPACK_DECOMPRESSION_FAILED. 
+blocked using the SETTINGS_QPACK_BLOCKED_STREAMS setting; see {{configuration}}. +An encoder MUST limit the number of streams which could become blocked to the +value of SETTINGS_QPACK_BLOCKED_STREAMS at all times. If a decoder encounters +more blocked streams than it promised to support, it MUST treat this as a +connection error of type QPACK_DECOMPRESSION_FAILED. Note that the decoder might not become blocked on every stream which risks becoming blocked. @@ -276,37 +276,56 @@ An encoder can decide whether to risk having a stream become blocked. If permitted by the value of SETTINGS_QPACK_BLOCKED_STREAMS, compression efficiency can often be improved by referencing dynamic table entries that are still in transit, but if there is loss or reordering the stream can become blocked at the -decoder. An encoder avoids the risk of blocking by only referencing dynamic +decoder. An encoder can avoid the risk of blocking by only referencing dynamic table entries which have been acknowledged, but this could mean using literals. Since literals make the header block larger, this can result in the encoder becoming blocked on congestion or flow control limits. -### Known Received Count +### Avoiding Flow Control Deadlocks + +Writing instructions on streams that are limited by flow control can produce +deadlocks. + +A decoder might stop issuing flow control credit on the stream that carries a +header block until the necessary updates are received on the encoder +stream. If the granting of flow control credit on the encoder stream (or the +connection as a whole) depends on the consumption and release of data on the +stream carrying the header block, a deadlock might result. + +More generally, a stream containing a large instruction can become deadlocked if +the decoder withholds flow control credit until the instruction is completely +received. 
-In order to identify which dynamic table entries can be safely used without a -stream becoming blocked, the encoder tracks the number of entries received by -the decoder. The Known Received Count tracks the total number of acknowledged -insertions. +To avoid these deadlocks, an encoder SHOULD avoid writing an instruction unless +sufficient stream and connection flow control credit is available for the entire +instruction. -When blocking references are permitted, the encoder uses Header Acknowledgement -instructions ({{header-acknowledgement}}) to maintain the Known Received -Count. If a header block was potentially blocking, the acknowledgement implies -that the decoder has received all dynamic table state necessary to process the -header block. If the Required Insert Count of an acknowledged header block was -greater than the encoder's current Known Received Count, the block's Required -Insert Count becomes the new Known Received Count. +### Known Received Count + +The Known Received Count is the total number of dynamic table insertions and +duplications acknowledged by the decoder. The encoder tracks the Known Received +Count in order to identify which dynamic table entries can be referenced without +potentially blocking a stream. The decoder tracks the Known Received Count in +order to be able to send Insert Count Increment instructions. -To acknowledge dynamic table entries which are not referenced by header blocks, -for example because the encoder or the decoder have chosen not to risk blocked -streams, the decoder sends an Insert Count Increment instruction (see -{{insert-count-increment}}). +A Header Acknowledgement instruction ({{header-acknowledgement}}) implies that +the decoder has received all dynamic table state necessary to process +the corresponding header block. If the Required Insert Count of the +acknowledged header block is greater than the current Known Received Count, +Known Received Count is updated to the value of the Required Insert Count. 
+An Insert Count Increment instruction ({{insert-count-increment}}) increases the +Known Received Count by its Increment parameter. See {{new-table-entries}} for +guidance. ## Decoder As in HPACK, the decoder processes header blocks and emits the corresponding -header lists. It also processes dynamic table modifications from encoder -instructions received on the encoder stream. +header lists. It also processes instructions received on the encoder stream that +modify the dynamic table. Note that header blocks and encoder stream +instructions arrive on separate streams. This is unlike HPACK, where header +blocks can contain instructions that modify the dynamic table, and there is no +dedicated stream of HPACK instructions. The decoder MUST emit header fields in the order their representations appear in the input header block. @@ -322,14 +341,13 @@ While blocked, header block data SHOULD remain in the blocked stream's flow control window. A stream becomes unblocked when the Insert Count becomes greater than or equal to the Required Insert Count for all header blocks the decoder has started reading from the stream. - When processing header blocks, the decoder expects the Required Insert Count to exactly match the value defined in {{blocked-streams}}. If it encounters a smaller value than expected, it MUST treat this as a connection error of type -HTTP_QPACK_DECOMPRESSION_FAILED (see {{invalid-references}}). If it encounters a +QPACK_DECOMPRESSION_FAILED; see {{invalid-references}}. If it encounters a larger value than expected, it MAY treat this as a connection error of type -HTTP_QPACK_DECOMPRESSION_FAILED. +QPACK_DECOMPRESSION_FAILED. ### State Synchronization @@ -350,30 +368,29 @@ given stream. When an endpoint receives a stream reset before the end of a stream or before all header blocks are processed on that stream, or when it abandons reading of a -stream, it generates a Stream Cancellation instruction (see -{{stream-cancellation}}). 
This signals to the encoder that all references to -the dynamic table on that stream are no longer outstanding. A decoder with a -maximum dynamic table capacity equal to zero (see -{{maximum-dynamic-table-capacity}}) MAY omit sending Stream Cancellations, -because the encoder cannot have any dynamic table references. An encoder cannot -infer from this instruction that any updates to the dynamic table have been -received. +stream, it generates a Stream Cancellation instruction; see +{{stream-cancellation}}. This signals to the encoder that all references to the +dynamic table on that stream are no longer outstanding. A decoder with a +maximum dynamic table capacity ({{maximum-dynamic-table-capacity}}) equal to +zero MAY omit sending Stream Cancellations, because the encoder cannot have +any dynamic table references. An encoder cannot infer from this instruction +that any updates to the dynamic table have been received. The Header Acknowledgement and Stream Cancellation instructions permit the encoder to remove references to entries in the dynamic table. When an entry with absolute index lower than the Known Received Count has zero references, -then it is no longer considered blocking (see {{blocked-insertion}}). +then it is considered evictable; see {{blocked-insertion}}. #### New Table Entries After receiving new table entries on the encoder stream, the decoder chooses -when to emit Insert Count Increment instructions (see -{{insert-count-increment}}). Emitting this instruction after adding each new +when to emit Insert Count Increment instructions; see +{{insert-count-increment}}. Emitting this instruction after adding each new dynamic table entry will provide the timeliest feedback to the encoder, but could be redundant with other decoder feedback. 
By delaying an Insert Count Increment instruction, the decoder might be able to coalesce multiple Insert Count Increment instructions, or replace them entirely with Header -Acknowledgements (see {{header-acknowledgement}}). However, delaying too long +Acknowledgements; see {{header-acknowledgement}}. However, delaying too long may lead to compression inefficiencies if the encoder waits for an entry to be acknowledged before using it. @@ -381,13 +398,13 @@ acknowledged before using it. If the decoder encounters a reference in a header block representation to a dynamic table entry which has already been evicted or which has an absolute -index greater than or equal to the declared Required Insert Count (see -{{header-prefix}}), it MUST treat this as a connection error of type -`HTTP_QPACK_DECOMPRESSION_FAILED`. +index greater than or equal to the declared Required Insert Count +({{header-prefix}}), it MUST treat this as a connection error of type +QPACK_DECOMPRESSION_FAILED. If the decoder encounters a reference in an encoder instruction to a dynamic table entry which has already been evicted, it MUST treat this as a connection -error of type `HTTP_QPACK_ENCODER_STREAM_ERROR`. +error of type QPACK_ENCODER_STREAM_ERROR. # Header Tables @@ -396,7 +413,7 @@ Unlike in HPACK, entries in the QPACK static and dynamic tables are addressed separately. The following sections describe how entries in each table are addressed. -## Static Table {#table-static} +## Static Table {#header-table-static} The static table consists of a predefined static list of header fields, each of which has a fixed index over time. Its entries are defined in {{static-table}}. @@ -405,21 +422,20 @@ All entries in the static table have a name and a value. However, values can be empty (that is, have a length of 0). Each entry is identified by a unique index. -Note the QPACK static table is indexed from 0, whereas the HPACK static table -is indexed from 1. 
+Note that the QPACK static table is indexed from 0, whereas the HPACK static +table is indexed from 1. When the decoder encounters an invalid static table index in a header block representation it MUST treat this as a connection error of type -`HTTP_QPACK_DECOMPRESSION_FAILED`. If this index is received on the encoder -stream, this MUST be treated as a connection error of type -`HTTP_QPACK_ENCODER_STREAM_ERROR`. +QPACK_DECOMPRESSION_FAILED. If this index is received on the encoder stream, +this MUST be treated as a connection error of type QPACK_ENCODER_STREAM_ERROR. -## Dynamic Table {#table-dynamic} +## Dynamic Table {#header-table-dynamic} The dynamic table consists of a list of header fields maintained in first-in, first-out order. Each HTTP/3 endpoint holds a dynamic table that is initially -empty. Entries are added by encoder instructions received on the encoder stream -(see {{encoder-instructions}}). +empty. Entries are added by encoder instructions received on the encoder +stream; see {{encoder-instructions}}. The dynamic table can contain duplicate entries (i.e., entries with the same name and same value). Therefore, duplicate entries MUST NOT be treated as an @@ -431,12 +447,9 @@ Dynamic table entries can have empty values. The size of the dynamic table is the sum of the size of its entries. -The size of an entry is the sum of its name's length in bytes (as defined in -{{string-literals}}), its value's length in bytes, and 32. - -The size of an entry is calculated using the length of its name and value -without Huffman encoding applied. - +The size of an entry is the sum of its name's length in bytes, its value's +length in bytes, and 32. The size of an entry is calculated using the length of +its name and value without Huffman encoding applied. ### Dynamic Table Capacity and Eviction {#eviction} @@ -448,20 +461,21 @@ table. 
Before a new entry is added to the dynamic table, entries are evicted from the end of the dynamic table until the size of the dynamic table is less than or -equal to (table capacity - size of new entry). The encoder MUST NOT evict a -blocking dynamic table entry (see {{blocked-insertion}}). The entry is then -added to the table. It is an error if the encoder attempts to add an entry that -is larger than the dynamic table capacity; the decoder MUST treat this as a -connection error of type `HTTP_QPACK_ENCODER_STREAM_ERROR`. +equal to (table capacity - size of new entry). The encoder MUST NOT cause a +dynamic table entry to be evicted unless that entry is evictable; see +{{blocked-insertion}}. The new entry is then added to the table. It is an +error if the encoder attempts to add an entry that is larger than the dynamic +table capacity; the decoder MUST treat this as a connection error of type +QPACK_ENCODER_STREAM_ERROR. A new entry can reference an entry in the dynamic table that will be evicted when adding this new entry into the dynamic table. Implementations are cautioned to avoid deleting the referenced name or value if the referenced entry is evicted from the dynamic table prior to inserting the new entry. -Whenever the dynamic table capacity is reduced by the encoder (see -{{set-dynamic-capacity}}), entries are evicted from the end of the dynamic table -until the size of the dynamic table is less than or equal to the new table +Whenever the dynamic table capacity is reduced by the encoder +({{set-dynamic-capacity}}), entries are evicted from the end of the dynamic +table until the size of the dynamic table is less than or equal to the new table capacity. This mechanism can be used to completely clear entries from the dynamic table by setting a capacity of 0, which can subsequently be restored. @@ -471,35 +485,34 @@ dynamic table by setting a capacity of 0, which can subsequently be restored. 
To bound the memory requirements of the decoder, the decoder limits the maximum value the encoder is permitted to set for the dynamic table capacity. In HTTP/3, this limit is determined by the value of -SETTINGS_QPACK_MAX_TABLE_CAPACITY sent by the decoder (see {{configuration}}). +SETTINGS_QPACK_MAX_TABLE_CAPACITY sent by the decoder; see {{configuration}}. The encoder MUST not set a dynamic table capacity that exceeds this maximum, but -it can choose to use a lower dynamic table capacity (see -{{set-dynamic-capacity}}). +it can choose to use a lower dynamic table capacity; see +{{set-dynamic-capacity}}. For clients using 0-RTT data in HTTP/3, the server's maximum table capacity is the remembered value of the setting, or zero if the value was not previously -sent. When the client's 0-RTT value of the SETTING is 0, the server MAY set it -to a non-zero value in its SETTINGS frame. If the remembered value is non-zero, -the server MUST send the same non-zero value in its SETTINGS frame. If it -specifies any other value, or omits SETTINGS_QPACK_MAX_TABLE_CAPACITY from +sent. When the client's 0-RTT value of the SETTING is zero, the server MAY set +it to a non-zero value in its SETTINGS frame. If the remembered value is +non-zero, the server MUST send the same non-zero value in its SETTINGS frame. If +it specifies any other value, or omits SETTINGS_QPACK_MAX_TABLE_CAPACITY from SETTINGS, the encoder must treat this as a connection error of type -`HTTP_QPACK_DECODER_STREAM_ERROR`. +QPACK_DECODER_STREAM_ERROR. For HTTP/3 servers and HTTP/3 clients when 0-RTT is not attempted or is rejected, the maximum table capacity is 0 until the encoder processes a SETTINGS frame with a non-zero value of SETTINGS_QPACK_MAX_TABLE_CAPACITY. -When the maximum table capacity is 0, the encoder MUST NOT insert entries into -the dynamic table, and MUST NOT send any encoder instructions on the encoder -stream. 
+When the maximum table capacity is zero, the encoder MUST NOT insert entries +into the dynamic table, and MUST NOT send any encoder instructions on the +encoder stream. ### Absolute Indexing {#indexing} -Each entry possesses both an absolute index which is fixed for the lifetime of -that entry and a relative index which changes based on the context of the -reference. The first entry inserted has an absolute index of "0"; indices -increase by one with each insertion. +Each entry possesses an absolute index which is fixed for the lifetime of that +entry. The first entry inserted has an absolute index of "0"; indices increase +by one with each insertion. ### Relative Indexing @@ -508,10 +521,10 @@ Relative indices begin at zero and increase in the opposite direction from the absolute index. Determining which entry has a relative index of "0" depends on the context of the reference. -In encoder instructions, a relative index of "0" always refers to the most -recently inserted value in the dynamic table. Note that this means the entry -referenced by a given relative index will change while interpreting instructions -on the encoder stream. +In encoder instructions ({{encoder-instructions}}), a relative index of "0" +refers to the most recently inserted value in the dynamic table. Note that this +means the entry referenced by a given relative index will change while +interpreting instructions on the encoder stream. ~~~~~ drawing +-----+---------------+-------+ @@ -528,20 +541,18 @@ d = count of entries dropped ~~~~~ {: title="Example Dynamic Table Indexing - Encoder Stream"} -Unlike encoder instructions, relative indices in header block representations -are relative to the Base at the beginning of the header block (see -{{header-prefix}}). This ensures that references are stable even if header +Unlike in encoder instructions, relative indices in header block representations +are relative to the Base at the beginning of the header block; see +{{header-prefix}}. 
This ensures that references are stable even if header blocks and dynamic table updates are processed out of order. In a header block a relative index of "0" refers to the entry with absolute index equal to Base - 1. ~~~~~ drawing - Required - Insert - Count Base - | | - V V + Base + | + V +-----+-----+-----+-----+-------+ | n-1 | n-2 | n-3 | ... | d | Absolute Index +-----+-----+ - +-----+ - + @@ -550,15 +561,17 @@ index equal to Base - 1. n = count of entries inserted d = count of entries dropped +In this example, Base = n - 2 ~~~~~ {: title="Example Dynamic Table Indexing - Relative Index in Header Block"} ### Post-Base Indexing {#post-base} -Post-Base indices are used for entries with absolute indexes greater than or -equal to Base, starting at 0 for the entry with absolute index equal to Base, -and increasing in the same direction as the absolute index. +Post-Base indices are used in header block instructions for entries with +absolute indices greater than or equal to Base, starting at 0 for the entry with +absolute index equal to Base, and increasing in the same direction as the +absolute index. Post-Base indices allow an encoder to process a header block in a single pass and include references to entries added while processing this (or other) header @@ -576,6 +589,7 @@ blocks. n = count of entries inserted d = count of entries dropped +In this example, Base = n - 2 ~~~~~ {: title="Example Dynamic Table Indexing - Post-Base Index in Header Block"} @@ -587,8 +601,11 @@ d = count of entries dropped ### Prefixed Integers The prefixed integer from Section 5.1 of [RFC7541] is used heavily throughout -this document. The format from [RFC7541] is used unmodified. QPACK -implementations MUST be able to decode integers up to 62 bits long. +this document. The format from [RFC7541] is used unmodified. Note, however, +that QPACK uses some prefix sizes not actually used in HPACK. + +QPACK implementations MUST be able to decode integers up to and including 62 +bits long. 
### String Literals @@ -604,7 +621,8 @@ table from Appendix B of [RFC7541] is used without modification. This document expands the definition of string literals and permits them to begin other than on a byte boundary. An "N-bit prefix string literal" begins with the same Huffman flag, followed by the length encoded as an (N-1)-bit -prefix integer. The remainder of the string literal is unmodified. +prefix integer. The prefix size, N, can have a value between 2 and 8 inclusive. +The remainder of the string literal is unmodified. A string literal without a prefix length noted is an 8-bit prefix string literal and follows the definitions in [RFC7541] without modification. @@ -628,14 +646,14 @@ type HTTP_STREAM_CREATION_ERROR. These streams MUST NOT be closed. Closure of either unidirectional stream type MUST be treated as a connection error of type HTTP_CLOSED_CRITICAL_STREAM. -An endpoint MAY avoid creating its own encoder stream if it's not going to be -used (for example if the endpoint doesn't wish to use the dynamic table, or if -the maximum size of the dynamic table permitted by the peer is zero). +An endpoint MAY avoid creating an encoder stream if it's not going to be used +(for example, if its encoder doesn't wish to use the dynamic table, or if the +maximum size of the dynamic table permitted by the peer is zero). -An endpoint MAY avoid creating its own decoder stream if the maximum size of -its own dynamic table is zero. +An endpoint MAY avoid creating a decoder stream if its decoder sets the maximum +capacity of the dynamic table to zero. -An endpoint MUST allow its peer to create both encoder and decoder streams +An endpoint MUST allow its peer to create an encoder stream and a decoder stream even if the connection's settings prevent their use. ## Encoder Instructions {#encoder-instructions} @@ -654,8 +672,8 @@ This section specifies the following encoder instructions.
An encoder informs the decoder of a change to the dynamic table capacity using an instruction which begins with the '001' three-bit pattern. This is followed -by the new dynamic table capacity represented as an integer with a 5-bit prefix -(see {{prefixed-integers}}). +by the new dynamic table capacity represented as an integer with a 5-bit prefix; +see {{prefixed-integers}}. ~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -667,15 +685,14 @@ by the new dynamic table capacity represented as an integer with a 5-bit prefix The new capacity MUST be lower than or equal to the limit described in {{maximum-dynamic-table-capacity}}. In HTTP/3, this limit is the value of the -SETTINGS_QPACK_MAX_TABLE_CAPACITY parameter (see {{configuration}}) received -from the decoder. The decoder MUST treat a new dynamic table capacity value -that exceeds this limit as a connection error of type -`HTTP_QPACK_ENCODER_STREAM_ERROR`. +SETTINGS_QPACK_MAX_TABLE_CAPACITY parameter ({{configuration}}) received from +the decoder. The decoder MUST treat a new dynamic table capacity value that +exceeds this limit as a connection error of type QPACK_ENCODER_STREAM_ERROR. -Reducing the dynamic table capacity can cause entries to be evicted (see -{{eviction}}). This MUST NOT cause the eviction of blocking entries (see -{{blocked-insertion}}). Changing the capacity of the dynamic table is not -acknowledged as this instruction does not insert an entry. +Reducing the dynamic table capacity can cause entries to be evicted; see +{{eviction}}. This MUST NOT cause the eviction of entries which are not +evictable; see {{blocked-insertion}}. Changing the capacity of the dynamic +table is not acknowledged as this instruction does not insert an entry. ### Insert With Name Reference @@ -683,13 +700,13 @@ An encoder adds an entry to the dynamic table where the header field name matches the header field name of an entry stored in the static or the dynamic table using an instruction that starts with the '1' one-bit pattern. 
The second ('T') bit indicates whether the reference is to the static or dynamic table. The -6-bit prefix integer (see {{prefixed-integers}}) that follows is used to locate +6-bit prefix integer ({{prefixed-integers}}) that follows is used to locate the table entry for the header name. When T=1, the number represents the static table index; when T=0, the number is the relative index of the entry in the dynamic table. The header name reference is followed by the header field value represented as a -string literal (see {{string-literals}}). +string literal; see {{string-literals}}. ~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -711,8 +728,8 @@ and the header field value are represented as string literals using an instruction that starts with the '01' two-bit pattern. This is followed by the name represented as a 6-bit prefix string literal, and -the value represented as an 8-bit prefix string literal (see -{{string-literals}}). +the value represented as an 8-bit prefix string literal; see +{{string-literals}}. ~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -734,7 +751,7 @@ the value represented as an 8-bit prefix string literal (see An encoder duplicates an existing entry in the dynamic table using an instruction that begins with the '000' three-bit pattern. This is followed by the relative index of the existing entry represented as an integer with a 5-bit -prefix (see {{prefixed-integers}}. +prefix; see {{prefixed-integers}}. ~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -745,19 +762,15 @@ prefix (see {{prefixed-integers}}. {:#fig-index-with-duplication title="Duplicate"} The existing entry is re-inserted into the dynamic table without resending -either the name or the value. This is useful to mitigate the eviction of older -entries which are frequently referenced, both to avoid the need to resend the -header and to avoid the entry in the table blocking the ability to insert new -headers. +either the name or the value. 
This is useful to avoid adding a reference to an +older entry, which might block inserting new entries. ## Decoder Instructions {#decoder-instructions} -Decoder instructions provide information used to ensure consistency of the -dynamic table. They are sent from the decoder to the encoder on a decoder -stream; that is, the server informs the client about the processing of the -client's header blocks and table updates, and the client informs the server -about the processing of the server's header blocks and table updates. +A decoder sends decoder instructions on the decoder stream to inform the encoder +about the processing of header blocks and table updates to ensure consistency of +the dynamic table. This section specifies the following decoder instructions. @@ -766,8 +779,8 @@ This section specifies the following decoder instructions. After processing a header block whose declared Required Insert Count is not zero, the decoder emits a Header Acknowledgement instruction. The instruction begins with the '1' one-bit pattern which is followed by the header block's -associated stream ID encoded as a 7-bit prefix integer (see -{{prefixed-integers}}). +associated stream ID encoded as a 7-bit prefix integer; see +{{prefixed-integers}}. This instruction is used as described in {{known-received-count}} and in {{state-synchronization}}. @@ -783,7 +796,10 @@ in {{state-synchronization}}. If an encoder receives a Header Acknowledgement instruction referring to a stream on which every header block with a non-zero Required Insert Count has already been acknowledged, that MUST be treated as a connection error of type -`HTTP_QPACK_DECODER_STREAM_ERROR`. +QPACK_DECODER_STREAM_ERROR. + +The Header Acknowledgement instruction might increase the Known Received Count; +see {{known-received-count}}. ### Stream Cancellation @@ -806,13 +822,11 @@ This instruction is used as described in {{state-synchronization}}. 
### Insert Count Increment The Insert Count Increment instruction begins with the '00' two-bit pattern, -followed by the Increment encoded as a 6-bit prefix integer. The value of the -Increment is the total number of dynamic table insertions and duplications -processed by the decoder since the last time it sent a Header Acknowledgement -instruction that increased the Known Received Count (see -{{known-received-count}}) or an Insert Count Increment instruction. The encoder -uses this value to update the Known Received Count, as described in -{{state-synchronization}}. +followed by the Increment encoded as a 6-bit prefix integer. This instruction +increases the Known Received Count ({{known-received-count}}) by the value of +the Increment parameter. The decoder should send an Increment value that +increases the Known Received Count to the total number of dynamic table +insertions and duplications processed so far. ~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -822,9 +836,9 @@ uses this value to update the Known Received Count, as described in ~~~~~~~~~~ {:#fig-size-sync title="Insert Count Increment"} -An encoder that receives an Increment field equal to zero or one that increases +An encoder that receives an Increment field equal to zero, or one that increases the Known Received Count beyond what the encoder has sent MUST treat this as a -connection error of type `HTTP_QPACK_DECODER_STREAM_ERROR`. +connection error of type QPACK_DECODER_STREAM_ERROR. ## Header Block Representations @@ -841,10 +855,8 @@ protocol. Each header block is prefixed with two integers. The Required Insert Count is encoded as an integer with an 8-bit prefix after the encoding described in -{{ric}}). The Base is encoded as sign-and-modulus integer, using a single sign -bit ('S') and a value with a 7-bit prefix (see {{base}}). - -These two values are followed by representations for compressed headers. +{{ric}}.
The Base is encoded as a sign bit ('S') and a Delta Base value with a +7-bit prefix; see {{base}}. ~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -883,14 +895,14 @@ have. The smallest entry has empty name and value strings and has the size of ~~~ `MaxTableCapacity` is the maximum capacity of the dynamic table as specified by -the decoder (see {{maximum-dynamic-table-capacity}}). +the decoder; see {{maximum-dynamic-table-capacity}}. This encoding limits the length of the prefix on long-lived connections. The decoder can reconstruct the Required Insert Count using an algorithm such as the following. If the decoder encounters a value of EncodedInsertCount that could not have been produced by a conformant encoder, it MUST treat this as a -connection error of type `HTTP_QPACK_DECOMPRESSION_FAILED`. +connection error of type QPACK_DECOMPRESSION_FAILED. TotalNumberOfInserts is the total number of inserts into the decoder's dynamic table. @@ -930,11 +942,14 @@ value of 3 indicates that the Required Insert Count is 9 for the header block. The `Base` is used to resolve references in the dynamic table as described in {{relative-indexing}}. -To save space, the Base is encoded relative to the Insert Count using a one-bit -sign ('S') and the `Delta Base` value. A sign bit of 0 indicates that the Base -is greater than or equal to the value of the Insert Count; the value of Delta -Base is added to the Insert Count to determine the value of the Base. A sign -bit of 1 indicates that the Base is less than the Insert Count. That is: +To save space, the Base is encoded relative to the Required Insert Count using a +one-bit sign ('S') and the `Delta Base` value. A sign bit of 0 indicates that +the Base is greater than or equal to the value of the Required Insert Count; the +decoder adds the value of Delta Base to the Required Insert Count to determine +the value of the Base. 
A sign bit of 1 indicates that the Base is less than the +Required Insert Count; the decoder subtracts the value of Delta Base from the +Required Insert Count and also subtracts one to determine the value of the Base. +That is: ~~~ if S == 0: @@ -952,13 +967,13 @@ entries, the Base will be greater than the Required Insert Count, so the delta will be positive and the sign bit is set to 0. An encoder that produces table updates before encoding a header block might set -Required Insert Count and the Base to the same value. In such case, both the -sign bit and the Delta Base will be set to zero. +Base to the value of Required Insert Count. In such a case, both the sign bit and +the Delta Base will be set to zero. A header block that does not reference the dynamic table can use any value for -the Base; setting Delta Base to zero is the most efficient encoding. +the Base; setting Delta Base to zero is one of the most efficient encodings. -For example, with a Required Insert Count of 9, a decoder receives a S bit of 1 +For example, with a Required Insert Count of 9, a decoder receives an S bit of 1 and a Delta Base of 2. This sets the Base to 6 and enables post-base indexing for three entries. In this example, a relative index of 1 refers to the 5th entry that was added to the table; a post-base index of 1 refers to the 8th @@ -967,10 +982,9 @@ entry. ### Indexed Header Field -An indexed header field representation identifies an entry in either the static -table or the dynamic table and causes that header field to be added to the -decoded header list, as described in Section 3.2 of [RFC7541]. - +An indexed header field representation identifies an entry in the static table, +or an entry in the dynamic table with an absolute index less than the value of +the Base. ~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -980,22 +994,19 @@ decoded header list, as described in Section 3.2 of [RFC7541].
~~~~~~~~~~ {: title="Indexed Header Field"} -If the entry is in the static table, or in the dynamic table with an absolute -index less than the Base, this representation starts with the '1' 1-bit pattern, -followed by the 'T' bit indicating whether the reference is into the static or -dynamic table. The 6-bit prefix integer (see {{prefixed-integers}}) that -follows is used to locate the table entry for the header field. When T=1, the -number represents the static table index; when T=0, the number is the relative -index of the entry in the dynamic table. +This representation starts with the '1' 1-bit pattern, followed by the 'T' bit +indicating whether the reference is into the static or dynamic table. The 6-bit +prefix integer ({{prefixed-integers}}) that follows is used to locate the +table entry for the header field. When T=1, the number represents the static +table index; when T=0, the number is the relative index of the entry in the +dynamic table. ### Indexed Header Field With Post-Base Index -If the entry is in the dynamic table with an absolute index greater than or -equal to the Base, the representation starts with the '0001' 4-bit pattern, -followed by the post-base index (see {{post-base}}) of the matching header -field, represented as an integer with a 4-bit prefix (see -{{prefixed-integers}}). +An indexed header field with post-base index representation identifies an entry +in the dynamic table with an absolute index greater than or equal to the value +of the Base. ~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -1005,25 +1016,17 @@ field, represented as an integer with a 4-bit prefix (see ~~~~~~~~~~ {: title="Indexed Header Field with Post-Base Index"} +This representation starts with the '0001' 4-bit pattern. This is followed by +the post-base index ({{post-base}}) of the matching header field, represented as +an integer with a 4-bit prefix; see {{prefixed-integers}}. 
-### Literal Header Field With Name Reference {#literal-name-reference} - -A literal header field with a name reference represents a header where the -header field name matches the header field name of an entry stored in the static -table or the dynamic table. -If the entry is in the static table, or in the dynamic table with an absolute -index less than the Base, this representation starts with the '01' two-bit -pattern. +### Literal Header Field With Name Reference {#literal-name-reference} -The following bit, 'N', indicates whether an intermediary is permitted to add -this header to the dynamic header table on subsequent hops. When the 'N' bit is -set, the encoded header MUST always be encoded with a literal representation. In -particular, when a peer sends a header field that it received represented as a -literal header field with the 'N' bit set, it MUST use a literal representation -to forward this header field. This bit is intended for protecting header field -values that are not to be put at risk by compressing them (see -{{security-considerations}} for more details). +A literal header field with name reference representation encodes a header field +where the header field name matches the header field name of an entry in the +static table, or the header field name of an entry in the dynamic table with an +absolute index less than the value of the Base. ~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -1037,31 +1040,33 @@ values that are not to be put at risk by compressing them (see ~~~~~~~~~~ {: title="Literal Header Field With Name Reference"} +This representation starts with the '01' two-bit pattern. The following bit, +'N', indicates whether an intermediary is permitted to add this header to the +dynamic header table on subsequent hops. When the 'N' bit is set, the encoded +header MUST always be encoded with a literal representation. 
In particular, when +a peer sends a header field that it received represented as a literal header +field with the 'N' bit set, it MUST use a literal representation to forward this +header field. This bit is intended for protecting header field values that are +not to be put at risk by compressing them; see {{security-considerations}} for +more details. + The fourth ('T') bit indicates whether the reference is to the static or dynamic -table. The 4-bit prefix integer (see {{prefixed-integers}}) that follows is +table. The 4-bit prefix integer ({{prefixed-integers}}) that follows is used to locate the table entry for the header name. When T=1, the number represents the static table index; when T=0, the number is the relative index of the entry in the dynamic table. Only the header field name is taken from the dynamic table entry; the header -field value is encoded as an 8-bit prefix string literal (see -{{string-literals}}). +field value is encoded as an 8-bit prefix string literal; see +{{string-literals}}. ### Literal Header Field With Post-Base Name Reference -A literal header field with post-base name reference represents a header field -where the name matches the header field name of a dynamic table entry with an -absolute index greater than or equal to the Base. - -This representation starts with the '0000' four-bit pattern. The fifth bit is -the 'N' bit as described in {{literal-name-reference}}. This is followed by a -post-base index of the dynamic table entry (see {{post-base}}) encoded as an -integer with a 3-bit prefix (see {{prefixed-integers}}). - -Only the header field name is taken from the dynamic table entry; the header -field value is encoded as an 8-bit prefix string literal (see -{{string-literals}}). +A literal header field with post-base name reference representation encodes a +header field where the header field name matches the header field name of a +dynamic table entry with an absolute index greater than or equal to the value of +the Base. 
~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -1075,16 +1080,20 @@ field value is encoded as an 8-bit prefix string literal (see ~~~~~~~~~~ {: title="Literal Header Field With Post-Base Name Reference"} +This representation starts with the '0000' four-bit pattern. The fifth bit is +the 'N' bit as described in {{literal-name-reference}}. This is followed by a +post-base index of the dynamic table entry ({{post-base}}) encoded as an +integer with a 3-bit prefix; see {{prefixed-integers}}. + +Only the header field name is taken from the dynamic table entry; the header +field value is encoded as an 8-bit prefix string literal; see +{{string-literals}}. + ### Literal Header Field Without Name Reference The literal header field without name reference representation encodes a header -field name and header field value as string literals. - -This representation begins with the '001' three-bit pattern. The fourth bit is -the 'N' bit as described in {{literal-name-reference}}. The name follows, -represented as a 4-bit prefix string literal, then the value, represented as an -8-bit prefix string literal (see {{string-literals}}). +field name and a header field value as string literals. ~~~~~~~~~~ drawing 0 1 2 3 4 5 6 7 @@ -1100,13 +1109,18 @@ represented as a 4-bit prefix string literal, then the value, represented as an ~~~~~~~~~~ {: title="Literal Header Field Without Name Reference"} +This representation begins with the '001' three-bit pattern. The fourth bit is +the 'N' bit as described in {{literal-name-reference}}. The name follows, +represented as a 4-bit prefix string literal, then the value, represented as an +8-bit prefix string literal; see {{string-literals}}. + # Configuration QPACK defines two settings which are included in the HTTP/3 SETTINGS frame. SETTINGS_QPACK_MAX_TABLE_CAPACITY (0x1): - : The default value is zero. See {{table-dynamic}} for usage. This is + : The default value is zero. See {{header-table-dynamic}} for usage. 
This is the equivalent of the SETTINGS_HEADER_TABLE_SIZE from HTTP/2. SETTINGS_QPACK_BLOCKED_STREAMS (0x7): @@ -1118,22 +1132,22 @@ QPACK defines two settings which are included in the HTTP/3 SETTINGS frame. The following error codes are defined for HTTP/3 to indicate failures of QPACK which prevent the connection from continuing: -HTTP_QPACK_DECOMPRESSION_FAILED (0x200): +QPACK_DECOMPRESSION_FAILED (0x200): : The decoder failed to interpret a header block and is not able to continue decoding that header block. -HTTP_QPACK_ENCODER_STREAM_ERROR (0x201): +QPACK_ENCODER_STREAM_ERROR (0x201): : The decoder failed to interpret an encoder instruction received on the encoder stream. -HTTP_QPACK_DECODER_STREAM_ERROR (0x202): +QPACK_DECODER_STREAM_ERROR (0x202): : The encoder failed to interpret a decoder instruction received on the decoder stream. # Security Considerations -TBD. +TBD. Also see Section 7.1 of [RFC7541]. While the negotiated limit on the dynamic table size accounts for much of the memory that can be consumed by a QPACK implementation, data which cannot be @@ -1178,9 +1192,9 @@ are registered in the "HTTP/3 Error Code" registry established in {{HTTP3}}. 
| --------------------------------- | ----- | ---------------------------------------- | ---------------------- | | Name | Code | Description | Specification | | --------------------------------- | ----- | ---------------------------------------- | ---------------------- | -| HTTP_QPACK_DECOMPRESSION_FAILED | 0x200 | Decompression of a header block failed | {{error-handling}} | -| HTTP_QPACK_ENCODER_STREAM_ERROR | 0x201 | Error on the encoder stream | {{error-handling}} | -| HTTP_QPACK_DECODER_STREAM_ERROR | 0x202 | Error on the decoder stream | {{error-handling}} | +| QPACK_DECOMPRESSION_FAILED | 0x200 | Decompression of a header block failed | {{error-handling}} | +| QPACK_ENCODER_STREAM_ERROR | 0x201 | Error on the encoder stream | {{error-handling}} | +| QPACK_DECODER_STREAM_ERROR | 0x202 | Error on the decoder stream | {{error-handling}} | | --------------------------------- | ----- | ---------------------------------------- | ---------------------- | @@ -1348,6 +1362,18 @@ return controlBuffer, prefixBuffer + streamBuffer > **RFC Editor's Note:** Please remove this section prior to publication of a > final version of this document. +## Since draft-ietf-quic-qpack-12 + +Editorial changes only + +## Since draft-ietf-quic-qpack-11 + +Editorial changes only + +## Since draft-ietf-quic-qpack-10 + +Editorial changes only + ## Since draft-ietf-quic-qpack-09 - Decoders MUST emit Header Acknowledgements (#2939) diff --git a/draft-ietf-quic-recovery.md b/draft-ietf-quic-recovery.md index 4a0ed6eda8..73c6f0b802 100644 --- a/draft-ietf-quic-recovery.md +++ b/draft-ietf-quic-recovery.md @@ -95,11 +95,11 @@ of transport and security experience, and implements mechanisms that make it attractive as a modern general-purpose transport. The QUIC protocol is described in {{QUIC-TRANSPORT}}. -QUIC implements the spirit of existing TCP loss recovery mechanisms, described -in RFCs, various Internet-drafts, and also those prevalent in the Linux TCP -implementation. 
This document describes QUIC congestion control and loss -recovery, and where applicable, attributes the TCP equivalent in RFCs, -Internet-drafts, academic papers, and/or TCP implementations. +QUIC implements the spirit of existing TCP congestion control and loss recovery +mechanisms, described in RFCs, various Internet-drafts, and also those prevalent +in the Linux TCP implementation. This document describes QUIC congestion +control and loss recovery, and where applicable, attributes the TCP equivalent +in RFCs, Internet-drafts, academic papers, and/or TCP implementations. # Conventions and Definitions @@ -111,35 +111,21 @@ when, and only when, they appear in all capitals, as shown here. Definitions of terms that are used in this document: -ACK-only: - -: Any packet containing only one or more ACK frame(s). - -In-flight: - -: Packets are considered in-flight when they have been sent and - are not ACK-only, and they are not acknowledged, declared lost, - or abandoned along with old keys. - Ack-eliciting Frames: -: All frames besides ACK or PADDING are considered ack-eliciting. +: All frames other than ACK, PADDING, and CONNECTION_CLOSE are considered + ack-eliciting. Ack-eliciting Packets: : Packets that contain ack-eliciting frames elicit an ACK from the receiver within the maximum ack delay and are called ack-eliciting packets. -Crypto Packets: - -: Packets containing CRYPTO data sent in Initial or Handshake - packets. - -Out-of-order Packets: +In-flight: -: Packets that do not increase the largest received packet number for its - packet number space by exactly one. Packets arrive out of order - when earlier packets are lost or delayed. +: Packets are considered in-flight when they are ack-eliciting or contain a + PADDING frame, and they have been sent but are not acknowledged, declared + lost, or abandoned along with old keys. 
# Design of the QUIC Transmission Machinery @@ -147,8 +133,8 @@ All transmissions in QUIC are sent with a packet-level header, which indicates the encryption level and includes a packet sequence number (referred to below as a packet number). The encryption level indicates the packet number space, as described in {{QUIC-TRANSPORT}}. Packet numbers never repeat within a packet -number space for the lifetime of a connection. Packet numbers monotonically -increase within a space, preventing ambiguity. +number space for the lifetime of a connection. Packet numbers are sent in +monotonically increasing order within a space, preventing ambiguity. This design obviates the need for disambiguating between transmissions and retransmissions and eliminates significant complexity from QUIC's interpretation @@ -167,8 +153,8 @@ of frames contained in a packet affect recovery and congestion control logic: performance of the QUIC handshake and use shorter timers for acknowledgement. -* Packets that contain only ACK frames do not count toward congestion control - limits and are not considered in-flight. +* Packets containing frames besides ACK or CONNECTION_CLOSE frames count toward + congestion control limits and are considered in-flight. * PADDING frames cause packets to contribute toward bytes in flight without directly causing an acknowledgment to be sent. @@ -194,8 +180,8 @@ measurement are unified across packet number spaces. TCP conflates transmission order at the sender with delivery order at the receiver, which results in retransmissions of the same data carrying the same sequence number, and consequently leads to "retransmission ambiguity". QUIC -separates the two: QUIC uses a packet number to indicate transmission order, -and any application data is sent in one or more streams, with delivery order +separates the two. QUIC uses a packet number to indicate transmission order. 
+Application data is sent in one or more streams and delivery order is determined by stream offsets encoded within STREAM frames. QUIC's packet number is strictly increasing within a packet number space, @@ -216,12 +202,13 @@ not available. ### Clearer Loss Epoch -QUIC ends a loss epoch when a packet sent after loss is declared is -acknowledged. TCP waits for the gap in the sequence number space to be filled, -and so if a segment is lost multiple times in a row, the loss epoch may not -end for several round trips. Because both should reduce their congestion windows -only once per epoch, QUIC will do it correctly once for every round trip that -experiences loss, while TCP may only do it once across multiple round trips. +QUIC starts a loss epoch when a packet is lost and ends one when any packet +sent after the epoch starts is acknowledged. TCP waits for the gap in the +sequence number space to be filled, and so if a segment is lost multiple times +in a row, the loss epoch may not end for several round trips. Because both +should reduce their congestion windows only once per epoch, QUIC will do it +once for every round trip that experiences loss, while TCP may only do it +once across multiple round trips. ### No Reneging @@ -247,11 +234,12 @@ more accurate round-trip time estimate (see Section 13.2 of {{QUIC-TRANSPORT}}). At a high level, an endpoint measures the time from when a packet was sent to when it is acknowledged as a round-trip time (RTT) sample. The endpoint uses RTT samples and peer-reported host delays (see Section 13.2 of -{{QUIC-TRANSPORT}}) to generate a statistical description of the connection's -RTT. An endpoint computes the following three values: the minimum value -observed over the lifetime of the connection (min_rtt), an -exponentially-weighted moving average (smoothed_rtt), and the variance in the -observed RTT samples (rttvar). +{{QUIC-TRANSPORT}}) to generate a statistical description of the network +path's RTT. 
An endpoint computes the following three values for each path: +the minimum value observed over the lifetime of the path (min_rtt), an +exponentially-weighted moving average (smoothed_rtt), and the mean deviation +(referred to as "variation" in the rest of this document) in the observed RTT +samples (rttvar). ## Generating RTT samples {#latest-rtt} @@ -270,19 +258,20 @@ latest_rtt = ack_time - send_time_of_largest_acked ~~~ An RTT sample is generated using only the largest acknowledged packet in the -received ACK frame. This is because a peer reports host delays for only the -largest acknowledged packet in an ACK frame. While the reported host delay is +received ACK frame. This is because a peer reports ACK delays for only the +largest acknowledged packet in an ACK frame. While the reported ACK delay is not used by the RTT sample measurement, it is used to adjust the RTT sample in subsequent computations of smoothed_rtt and rttvar {{smoothed-rtt}}. -To avoid generating multiple RTT samples using the same packet, an ACK frame +To avoid generating multiple RTT samples for a single packet, an ACK frame SHOULD NOT be used to update RTT estimates if it does not newly acknowledge the largest acknowledged packet. An RTT sample MUST NOT be generated on receiving an ACK frame that does not -newly acknowledge at least one ack-eliciting packet. A peer does not send an -ACK frame on receiving only non-ack-eliciting packets, so an ACK frame that is -subsequently sent can include an arbitrarily large Ack Delay field. Ignoring +newly acknowledge at least one ack-eliciting packet. A peer usually does not +send an ACK frame when only non-ack-eliciting packets are received. Therefore +an ACK frame that contains acknowledgements for only non-ack-eliciting packets +could include an arbitrarily large Ack Delay value. Ignoring such ACK frames avoids complications in subsequent smoothed_rtt and rttvar computations. 
@@ -293,27 +282,37 @@ retain sufficient history is an open research question. ## Estimating min_rtt {#min-rtt} -min_rtt is the minimum RTT observed over the lifetime of the connection. -min_rtt is set to the latest_rtt on the first sample in a connection, and to the -lesser of min_rtt and latest_rtt on subsequent samples. +min_rtt is the minimum RTT observed for a given network path. min_rtt is set +to the latest_rtt on the first RTT sample, and to the lesser of min_rtt and +latest_rtt on subsequent samples. In this document, min_rtt is used by loss +detection to reject implausibly small RTT samples. An endpoint uses only locally observed times in computing the min_rtt and does -not adjust for host delays reported by the peer. Doing so allows the endpoint +not adjust for ACK delays reported by the peer. Doing so allows the endpoint to set a lower bound for the smoothed_rtt based entirely on what it observes (see {{smoothed-rtt}}), and limits potential underestimation due to erroneously-reported delays by the peer. +The RTT for a network path may change over time. If a path's actual RTT +decreases, the min_rtt will adapt immediately on the first low sample. If +the path's actual RTT increases, the min_rtt will not adapt to it, allowing +future RTT samples that are smaller than the new RTT to be included in +smoothed_rtt. + ## Estimating smoothed_rtt and rttvar {#smoothed-rtt} smoothed_rtt is an exponentially-weighted moving average of an endpoint's RTT -samples, and rttvar is the endpoint's estimated variance in the RTT samples. +samples, and rttvar is the variation in the RTT samples, estimated using a +mean variation. The calculation of smoothed_rtt uses path latency after adjusting RTT samples -for host delays. For packets sent in the ApplicationData packet number space, -a peer limits any delay in sending an acknowledgement for an ack-eliciting -packet to no greater than the value it advertised in the max_ack_delay transport -parameter.
Consequently, when a peer reports an Ack Delay that is greater than -its max_ack_delay, the delay is attributed to reasons out of the peer's control, +for acknowledgement delays. These delays are computed using the ACK Delay +field of the ACK frame as described in Section 19.3 of {{QUIC-TRANSPORT}}. +For packets sent in the ApplicationData packet number space, a peer limits +any delay in sending an acknowledgement for an ack-eliciting packet to no +greater than the value it advertised in the max_ack_delay transport parameter. +Consequently, when a peer reports an Ack Delay that is greater than its +max_ack_delay, the delay is attributed to reasons out of the peer's control, such as scheduler latency at the peer or loss of previous ACK frames. Any delays beyond the peer's max_ack_delay are therefore considered effectively part of path delay and incorporated into the smoothed_rtt estimate. @@ -331,11 +330,11 @@ endpoint: min_rtt. This limits the underestimation that a misreporting peer can cause to the smoothed_rtt. -On the first RTT sample in a connection, the smoothed_rtt is set to the +On the first RTT sample for a network path, the smoothed_rtt is set to the latest_rtt. smoothed_rtt and rttvar are computed as follows, similar to {{?RFC6298}}. On -the first RTT sample in a connection: +the first RTT sample for a network path: ~~~ smoothed_rtt = latest_rtt @@ -357,8 +356,9 @@ rttvar = 3/4 * rttvar + 1/4 * rttvar_sample # Loss Detection {#loss-detection} -QUIC senders use both ack information and timeouts to detect lost packets, and -this section provides a description of these algorithms. +QUIC senders use acknowledgements to detect lost packets, and a probe +time out (see {{pto}}) to ensure acknowledgements are received. This section +provides a description of these algorithms. 
If a packet is lost, the QUIC transport needs to recover from that loss, such as by retransmitting the data, sending an updated frame, or abandoning the @@ -381,7 +381,7 @@ A packet is declared lost if it meets all the following conditions: packet ({{packet-threshold}}), or it was sent long enough in the past ({{time-threshold}}). -The acknowledgement indicates that a packet sent later was delivered, while the +The acknowledgement indicates that a packet sent later was delivered, and the packet and time thresholds provide some tolerance for packet reordering. Spuriously declaring packets as lost leads to unnecessary retransmissions and @@ -395,7 +395,8 @@ latency. The RECOMMENDED initial value for the packet reordering threshold (kPacketThreshold) is 3, based on best practices for TCP loss detection -{{?RFC5681}} {{?RFC6675}}. +{{?RFC5681}} {{?RFC6675}}. Implementations SHOULD NOT use a packet threshold +less than 3, to keep in line with TCP {{?RFC5681}}. Some networks may exhibit higher degrees of reordering, causing a sender to detect spurious losses. Implementers MAY use algorithms developed for TCP, such @@ -403,14 +404,13 @@ as TCP-NCR {{?RFC4653}}, to improve QUIC's reordering resilience. ### Time Threshold {#time-threshold} -Once a later packet packet within the same packet number space has been -acknowledged, an endpoint SHOULD declare an earlier packet lost if it was sent -a threshold amount of time in the past. To avoid declaring packets as lost too -early, this time threshold MUST be set to at least kGranularity. The time -threshold is: +Once a later packet within the same packet number space has been acknowledged, +an endpoint SHOULD declare an earlier packet lost if it was sent a threshold +amount of time in the past. To avoid declaring packets as lost too early, this +time threshold MUST be set to at least kGranularity. 
The time threshold is: ~~~ -kTimeThreshold * max(smoothed_rtt, latest_rtt, kGranularity) +max(kTimeThreshold * max(smoothed_rtt, latest_rtt), kGranularity) ~~~ If packets sent prior to the largest acknowledged packet cannot yet be declared @@ -429,7 +429,7 @@ The RECOMMENDED time threshold (kTimeThreshold), expressed as a round-trip time multiplier, is 9/8. Implementations MAY experiment with absolute thresholds, thresholds from -previous connections, adaptive thresholds, or including RTT variance. Smaller +previous connections, adaptive thresholds, or including RTT variation. Smaller thresholds reduce reordering resilience and increase spurious retransmissions, and larger thresholds increase loss detection delay. @@ -439,11 +439,13 @@ and larger thresholds increase loss detection delay. A Probe Timeout (PTO) triggers sending one or two probe datagrams when ack-eliciting packets are not acknowledged within the expected period of time or the handshake has not been completed. A PTO enables a connection to -recover from loss of tail packets or acknowledgements. The PTO algorithm used -in QUIC implements the reliability functions of Tail Loss Probe -{{?TLP=I-D.dukkipati-tcpm-tcp-loss-probe}} {{?RACK}}, RTO {{?RFC5681}} and -F-RTO algorithms for TCP {{?RFC5682}}, and the timeout computation is based on -TCP's retransmission timeout period {{?RFC6298}}. +recover from loss of tail packets or acknowledgements. + +As with loss detection, the probe timeout is per packet number space. +The PTO algorithm used in QUIC implements the reliability functions of +Tail Loss Probe {{?RACK}}, RTO {{?RFC5681}}, and F-RTO algorithms for +TCP {{?RFC5682}}. The timeout computation is based on TCP's retransmission +timeout period {{?RFC6298}}. ### Computing PTO @@ -459,25 +461,37 @@ kGranularity, smoothed_rtt, rttvar, and max_ack_delay are defined in The PTO period is the amount of time that a sender ought to wait for an acknowledgement of a sent packet. 
This time period includes the estimated -network roundtrip-time (smoothed_rtt), the variance in the estimate (4*rttvar), +network roundtrip-time (smoothed_rtt), the variation in the estimate (4*rttvar), and max_ack_delay, to account for the maximum time by which a receiver might -delay sending an acknowledgement. +delay sending an acknowledgement. When the PTO is armed for Initial or +Handshake packet number spaces, the max_ack_delay is 0, as specified in +Section 13.2.1 of {{QUIC-TRANSPORT}}. The PTO value MUST be set to at least kGranularity, to avoid the timer expiring immediately. +A sender computes its PTO timer every time an ack-eliciting packet is sent. +When ack-eliciting packets are in-flight in multiple packet number spaces, +the timer MUST be set for the packet number space with the earliest timeout, +except for ApplicationData, which MUST be ignored until the handshake +completes; see Section 4.1.1 of {{QUIC-TLS}}. Not arming the PTO for +ApplicationData prioritizes completing the handshake and prevents the server +from sending a 1-RTT packet on a PTO before it has the keys to process +a 1-RTT packet. + When a PTO timer expires, the PTO period MUST be set to twice its current value. This exponential reduction in the sender's rate is important because -the PTOs might be caused by loss of packets or acknowledgements due to severe -congestion. The life of a connection that is experiencing consecutive PTOs is -limited by the endpoint's idle timeout. +consecutive PTOs might be caused by loss of packets or acknowledgements due to +severe congestion. Even when there are ack-eliciting packets in-flight in +multiple packet number spaces, the exponential increase in probe timeout +occurs across all spaces to prevent excess load on the network. For example, +a timeout in the Initial packet number space doubles the length of the timeout +in the Handshake packet number space. -A sender computes its PTO timer every time an ack-eliciting packet is sent.
A -sender might choose to optimize this by setting the timer fewer times if it -knows that more ack-eliciting packets will be sent within a short period of -time. +The life of a connection that is experiencing consecutive PTOs is limited by +the endpoint's idle timeout. -The probe timer is not set if the time threshold {{time-threshold}} loss +The probe timer MUST NOT be set if the time threshold {{time-threshold}} loss detection timer is set. The time threshold loss detection timer is expected to both expire earlier than the PTO and be less likely to spuriously retransmit data. @@ -491,16 +505,16 @@ connection's initial RTT. If no previous RTT is available, the initial RTT SHOULD be set to 500ms, resulting in a 1 second initial timeout as recommended in {{?RFC6298}}. -A connection MAY use the delay between sending a PATH_CHALLENGE and receiving -a PATH_RESPONSE to seed initial_rtt for a new path, but the delay SHOULD NOT -be considered an RTT sample. +A connection MAY use the delay between sending a PATH_CHALLENGE and receiving a +PATH_RESPONSE to set the initial RTT (see kInitialRtt in +{{ld-consts-of-interest}}) for a new path, but the delay SHOULD NOT be +considered an RTT sample. Until the server has validated the client's address on the path, the amount of -data it can send is limited, as specified in Section 8.1 of {{QUIC-TRANSPORT}}. -Data at Initial encryption MUST be retransmitted before Handshake data and -data at Handshake encryption MUST be retransmitted before any ApplicationData -data. If no data can be sent, then the PTO alarm MUST NOT be armed until -data has been received from the client. +data it can send is limited to three times the amount of data received, +as specified in Section 8.1 of {{QUIC-TRANSPORT}}. If no data can be sent, +then the PTO alarm MUST NOT be armed until datagrams have been received from +the client. 
Since the server could be blocked until more packets are received from the client, it is the client's responsibility to send packets to unblock the server @@ -517,25 +531,30 @@ keys are available to the client, it MUST send a Handshake packet, and otherwise it MUST send an Initial packet in a UDP datagram of at least 1200 bytes. -Initial packets and Handshake packets may never be acknowledged, but they are +Initial packets and Handshake packets might never be acknowledged, but they are removed from bytes in flight when the Initial and Handshake keys are discarded. ### Sending Probe Packets When a PTO timer expires, a sender MUST send at least one ack-eliciting packet -as a probe, unless there is no data available to send. An endpoint MAY send up -to two full-sized datagrams containing ack-eliciting packets, to avoid an -expensive consecutive PTO expiration due to a single lost datagram. +in the packet number space as a probe, unless there is no data available to +send. An endpoint MAY send up to two full-sized datagrams containing +ack-eliciting packets, to avoid an expensive consecutive PTO expiration due +to a single lost datagram or to transmit data from multiple packet number spaces. + +In addition to sending data in the packet number space for which the timer +expired, the sender SHOULD send ack-eliciting packets from other packet +number spaces with in-flight data, coalescing packets if possible. -It is possible that the sender has no new or previously-sent data to send. As -an example, consider the following sequence of events: new application data is -sent in a STREAM frame, deemed lost, then retransmitted in a new packet, and -then the original transmission is acknowledged. In the absence of any new -application data, a PTO timer expiration now would find the sender with no new -or previously-sent data to send. +When the PTO timer expires, and there is new or previously sent unacknowledged +data, it MUST be sent.
-When there is no data to send, the sender SHOULD send a PING or other -ack-eliciting frame in a single packet, re-arming the PTO timer. +It is possible the sender has no new or previously-sent data to send. +As an example, consider the following sequence of events: new application data +is sent in a STREAM frame, deemed lost, then retransmitted in a new packet, +and then the original transmission is acknowledged. When there is no data to +send, the sender SHOULD send a PING or other ack-eliciting frame in a single +packet, re-arming the PTO timer. Alternatively, instead of sending an ack-eliciting packet, the sender MAY mark any packets still in flight as lost. Doing so avoids sending an additional @@ -551,7 +570,7 @@ Probe packets sent on a PTO MUST be ack-eliciting. A probe packet SHOULD carry new data when possible. A probe packet MAY carry retransmitted unacknowledged data when new data is unavailable, when flow control does not permit new data to be sent, or to opportunistically reduce loss recovery delay. Implementations -MAY use alternate strategies for determining the content of probe packets, +MAY use alternative strategies for determining the content of probe packets, including sending new or retransmitted data based on the application's priorities. @@ -572,23 +591,28 @@ prior unacknowledged packets to be marked as lost. When an acknowledgement is received that newly acknowledges packets, loss detection proceeds as dictated by packet and time threshold mechanisms; see {{ack-loss-detection}}. -## Retry and Version Negotiation +## Handling Retry Packets -A Retry or Version Negotiation packet causes a client to send another Initial -packet, effectively restarting the connection process and resetting congestion -control and loss recovery state, including resetting any pending timers. Either -packet indicates that the Initial was received but not processed. Neither -packet can be treated as an acknowledgment for the Initial. 
+A Retry packet causes a client to send another Initial packet, effectively +restarting the connection process. A Retry packet indicates that the Initial +was received, but not processed. A Retry packet cannot be treated as an +acknowledgment, because it does not indicate that a packet was processed or +specify the packet number. -The client MAY however compute an RTT estimate to the server as the time period -from when the first Initial was sent to when a Retry or a Version Negotiation -packet is received. The client MAY use this value to seed the RTT estimator for -a subsequent connection attempt to the server. +Clients that receive a Retry packet reset congestion control and loss recovery +state, including resetting any pending timers. Other connection state, in +particular cryptographic handshake messages, is retained; see Section 17.2.5 of +{{QUIC-TRANSPORT}}. + +The client MAY compute an RTT estimate to the server as the time period from +when the first Initial was sent to when a Retry or a Version Negotiation packet +is received. The client MAY use this value in place of its default for the +initial RTT estimate. ## Discarding Keys and Packet State {#discarding-packets} -When packet protection keys are discarded (see Section 4.9 of {{QUIC-TLS}}), all -packets that were sent with those keys can no longer be acknowledged because +When packet protection keys are discarded (see Section 4.10 of {{QUIC-TLS}}), +all packets that were sent with those keys can no longer be acknowledged because their acknowledgements cannot be processed anymore. The sender MUST discard all recovery state associated with those packets and MUST remove them from the count of bytes in flight. @@ -606,48 +630,39 @@ is expected to be infrequent. It is expected that keys are discarded after packets encrypted with them would be acknowledged or declared lost. 
Initial secrets however might be destroyed -sooner, as soon as handshake keys are available (see Section 4.9.1 of +sooner, as soon as handshake keys are available (see Section 4.10.1 of {{QUIC-TLS}}). -## Discussion - -The majority of constants were derived from best common practices among widely -deployed TCP implementations on the internet. Exceptions follow. +# Congestion Control {#congestion-control} -A shorter delayed ack time of 25ms was chosen because longer delayed acks can -delay loss recovery and for the small number of connections where less than -packet per 25ms is delivered, acking every packet is beneficial to congestion -control and loss recovery. +This document specifies a Reno congestion controller for QUIC {{?RFC6582}}. -# Congestion Control {#congestion-control} +The signals QUIC provides for congestion control are generic and are designed to +support different algorithms. Endpoints can unilaterally choose a different +algorithm to use, such as Cubic {{?RFC8312}}. -QUIC's congestion control is based on TCP NewReno {{?RFC6582}}. NewReno is a -congestion window based congestion control. QUIC specifies the congestion -window in bytes rather than packets due to finer control and the ease of -appropriate byte counting {{?RFC3465}}. +If an endpoint uses a different controller than that specified in this document, +the chosen controller MUST conform to the congestion control guidelines +specified in Section 3.1 of {{!RFC8085}}. -QUIC hosts MUST NOT send packets if they would increase bytes_in_flight (defined -in {{vars-of-interest}}) beyond the available congestion window, unless the -packet is a probe packet sent after a PTO timer expires, as described in -{{pto}}. +The algorithm in this document specifies and uses the controller's congestion +window in bytes. -Implementations MAY use other congestion control algorithms, such as -Cubic {{?RFC8312}}, and endpoints MAY use different algorithms from one another. 
-The signals QUIC provides for congestion control are generic and are designed -to support different algorithms. +An endpoint MUST NOT send a packet if it would cause bytes_in_flight (see +{{vars-of-interest}}) to be larger than the congestion window, unless the packet +is sent on a PTO timer expiration (see {{pto}}). ## Explicit Congestion Notification {#congestion-ecn} -If a path has been verified to support ECN, QUIC treats a Congestion Experienced -codepoint in the IP header as a signal of congestion. This document specifies an -endpoint's response when its peer receives packets with the Congestion -Experienced codepoint. As discussed in {{!RFC8311}}, endpoints are permitted to -experiment with other response functions. +If a path has been verified to support ECN {{?RFC3168}} {{?RFC8311}}, QUIC +treats a Congestion Experienced (CE) codepoint in the IP header as a signal of +congestion. This document specifies an endpoint's response when its peer +receives packets with the Congestion Experienced codepoint. ## Slow Start QUIC begins every connection in slow start and exits slow start upon loss or -upon increase in the ECN-CE counter. QUIC re-enters slow start anytime the +upon increase in the ECN-CE counter. QUIC re-enters slow start any time the congestion window is less than ssthresh, which only occurs after persistent congestion is declared. While in slow start, QUIC increases the congestion window by the number of bytes acknowledged when each acknowledgment is @@ -664,24 +679,23 @@ congestion window. ## Recovery Period -Recovery is a period of time beginning with detection of a lost packet or an -increase in the ECN-CE counter. Because QUIC does not retransmit packets, -it defines the end of recovery as a packet sent after the start of recovery -being acknowledged. This is slightly different from TCP's definition of -recovery, which ends when the lost packet that started recovery is acknowledged.
+A recovery period is entered when loss or ECN-CE marking of a packet is +detected. A recovery period ends when a packet sent during the recovery period +is acknowledged. This is slightly different from TCP's definition of recovery, +which ends when the lost packet that started recovery is acknowledged. The recovery period limits congestion window reduction to once per round trip. During recovery, the congestion window remains unchanged irrespective of new losses or increases in the ECN-CE counter. -## Ignoring Loss of Undecryptable Packets +## Ignoring Loss of Undecryptable Packets -During the handshake, some packet protection keys might not be -available when a packet arrives. In particular, Handshake and 0-RTT packets -cannot be processed until the Initial packets arrive, and 1-RTT packets -cannot be processed until the handshake completes. Endpoints MAY -ignore the loss of Handshake, 0-RTT, and 1-RTT packets that might arrive before -the peer has packet protection keys to process those packets. +During the handshake, some packet protection keys might not be +available when a packet arrives. In particular, Handshake and 0-RTT packets +cannot be processed until the Initial packets arrive, and 1-RTT packets +cannot be processed until the handshake completes. Endpoints MAY +ignore the loss of Handshake, 0-RTT, and 1-RTT packets that might arrive before +the peer has packet protection keys to process those packets. ## Probe Timeout @@ -723,19 +737,19 @@ illustrate persistent congestion: t=7 | Send Pkt #4 (PTO 3) t=8 | Recv ACK of Pkt #4 -The first three packets are determined to be lost when the ACK of packet 4 is -received at t=8. The congestion period is calculated as the time between the -oldest and newest lost packets: (3 - 0) = 3. The duration for persistent -congestion is equal to: (1 * kPersistentCongestionThreshold) = 3. 
Because the -threshold was reached and because none of the packets between the oldest and the -newest packets are acknowledged, the network is considered to have experienced -persistent congestion. +The first three packets are determined to be lost when the acknowledgement of +packet 4 is received at t=8. The congestion period is calculated as the time +between the oldest and newest lost packets: (3 - 0) = 3. The duration for +persistent congestion is equal to: (1 * kPersistentCongestionThreshold) = 3. +Because the threshold was reached and because none of the packets between the +oldest and the newest packets are acknowledged, the network is considered to +have experienced persistent congestion. When persistent congestion is established, the sender's congestion window MUST be reduced to the minimum congestion window (kMinimumWindow). This response of collapsing the congestion window on persistent congestion is functionally similar to a sender's response on a Retransmission Timeout (RTO) in TCP -{{RFC5681}} after Tail Loss Probes (TLP) {{TLP}}. +{{RFC5681}} after Tail Loss Probes (TLP) {{RACK}}. ## Pacing {#pacing} @@ -753,6 +767,14 @@ delivery of ACK frames is important for efficient loss recovery. Packets containing only ACK frames should therefore not be paced, to avoid delaying their delivery to the peer. +Sending multiple packets into the network without any delay between them +creates a packet burst that might cause short-term congestion and losses. +Implementations MUST either use pacing or limit such bursts to the initial +congestion window, which is recommended to be the minimum of +10 * max_datagram_size and max(2 * max_datagram_size, 14720), where +max_datagram_size is the current maximum size of a datagram for the connection, +not including UDP or IP overhead. + As an example of a well-known and publicly available implementation of a flow pacer, implementers are referred to the Fair Queue packet scheduler (fq qdisc) in Linux (3.11 onwards).
@@ -773,14 +795,7 @@ and not fully utilize the congestion window due to this delay. A sender should not consider itself application limited if it would have fully utilized the congestion window without pacing delay. -Sending multiple packets into the network without any delay between them -creates a packet burst that might cause short-term congestion and losses. -Implementations SHOULD either use pacing or reduce their congestion window -to limit such bursts to minimum of 10 * kMaxDatagramSize and -max(2* kMaxDatagramSize, 14720)), the same as the recommended initial -congestion window. - -A sender MAY implement alternate mechanisms to update its congestion window +A sender MAY implement alternative mechanisms to update its congestion window after periods of under-utilization, such as those proposed for TCP in {{?RFC7661}}. @@ -808,9 +823,9 @@ sender. Suppressing reports of ECN-CE markings could cause a sender to increase their send rate. This increase could result in congestion and loss. A sender MAY attempt to detect suppression of reports by marking occasional -packets that they send with ECN-CE. If a packet marked with ECN-CE is not -reported as having been marked when the packet is acknowledged, the sender -SHOULD then disable ECN for that path. +packets that they send with ECN-CE. If a packet sent with ECN-CE is not +reported as having been CE marked when the packet is acknowledged, then the +sender SHOULD disable ECN for that path. Reporting additional ECN-CE markings will cause a sender to reduce their sending rate, which is similar in effect to advertising reduced connection flow control @@ -876,8 +891,7 @@ time_sent: ## Constants of interest {#ld-consts-of-interest} Constants used in loss recovery are based on a combination of RFCs, papers, and -common practice. Some may need to be changed or negotiated in order to better -suit a variety of environments. +common practice. 
kPacketThreshold: : Maximum reordering in packets before packet threshold loss detection @@ -922,7 +936,7 @@ smoothed_rtt: {{?RFC6298}} rttvar: -: The RTT variance, computed as described in {{?RFC6298}} +: The RTT variation, computed as described in {{?RFC6298}} min_rtt: : The minimum RTT seen in the connection, ignoring ack delay. @@ -931,7 +945,7 @@ max_ack_delay: : The maximum amount of time by which the receiver intends to delay acknowledgments for packets in the ApplicationData packet number space. The actual ack_delay in a received ACK frame may be larger due to late timers, - reordering, or lost ACKs. + reordering, or lost ACK frames. loss_detection_timer: : Multi-modal timer used for loss detection. @@ -939,7 +953,7 @@ loss_detection_timer: pto_count: : The number of times a PTO has been sent without receiving an ack. -time_of_last_sent_ack_eliciting_packet: +time_of_last_sent_ack_eliciting_packet\[kPacketNumberSpace]: : The time the most recent ack-eliciting packet was sent. largest_acked_packet\[kPacketNumberSpace]: @@ -967,9 +981,9 @@ follows: rttvar = 0 min_rtt = 0 max_ack_delay = 0 - time_of_last_sent_ack_eliciting_packet = 0 for pn_space in [ Initial, Handshake, ApplicationData ]: largest_acked_packet[pn_space] = infinite + time_of_last_sent_ack_eliciting_packet[pn_space] = 0 loss_time[pn_space] = 0 ~~~ @@ -992,7 +1006,7 @@ Pseudocode for OnPacketSent follows: sent_packets[pn_space][packet_number].in_flight = in_flight if (in_flight): if (ack_eliciting): - time_of_last_sent_ack_eliciting_packet = now + time_of_last_sent_ack_eliciting_packet[pn_space] = now OnPacketSentCC(sent_bytes) sent_packets[pn_space][packet_number].size = sent_bytes SetLossDetectionTimer() @@ -1099,15 +1113,16 @@ timers wake up late. Timers set in the past SHOULD fire immediately. Pseudocode for SetLossDetectionTimer follows: ~~~ -// Returns the earliest loss_time and the packet number -// space it's from. Returns 0 if all times are 0. 
-GetEarliestLossTime(): - time = loss_time[Initial] +GetEarliestTimeAndSpace(times): + time = times[Initial] space = Initial for pn_space in [ Handshake, ApplicationData ]: - if (loss_time[pn_space] != 0 && - (time == 0 || loss_time[pn_space] < time)): - time = loss_time[pn_space]; + if (times[pn_space] != 0 && + (time == 0 || times[pn_space] < time) && + // Skip ApplicationData until handshake completion. + (pn_space != ApplicationData || + IsHandshakeComplete())): + time = times[pn_space]; space = pn_space return time, space @@ -1121,10 +1136,10 @@ PeerNotAwaitingAddressValidation(): has received 1-RTT ACK SetLossDetectionTimer(): - loss_time, _ = GetEarliestLossTime() - if (loss_time != 0): + earliest_loss_time, _ = GetEarliestTimeAndSpace(loss_time) + if (earliest_loss_time != 0): // Time threshold loss detection. - loss_detection_timer.update(loss_time) + loss_detection_timer.update(earliest_loss_time) return if (no ack-eliciting packets in flight && @@ -1141,8 +1156,9 @@ SetLossDetectionTimer(): max_ack_delay timeout = timeout * (2 ^ pto_count) - loss_detection_timer.update( - time_of_last_sent_ack_eliciting_packet + timeout) + sent_time, _ = GetEarliestTimeAndSpace( + time_of_last_sent_ack_eliciting_packet) + loss_detection_timer.update(sent_time + timeout) ~~~ @@ -1155,8 +1171,9 @@ Pseudocode for OnLossDetectionTimeout follows: ~~~ OnLossDetectionTimeout(): - loss_time, pn_space = GetEarliestLossTime() - if (loss_time != 0): + earliest_loss_time, pn_space = + GetEarliestTimeAndSpace(loss_time) + if (earliest_loss_time != 0): // Time threshold loss Detection DetectLostPackets(pn_space) SetLossDetectionTimer() @@ -1173,7 +1190,9 @@ OnLossDetectionTimeout(): else: // PTO. Send new data if available, else retransmit old data. // If neither is available, send a single PING frame.
- SendOneOrTwoAckElicitingPackets() + _, pn_space = GetEarliestTimeAndSpace( + time_of_last_sent_ack_eliciting_packet) + SendOneOrTwoAckElicitingPackets(pn_space) pto_count++ SetLossDetectionTimer() @@ -1232,24 +1251,20 @@ in {{congestion-control}}. ## Constants of interest {#cc-consts-of-interest} -Constants used in congestion control are based on a combination of RFCs, -papers, and common practice. Some may need to be changed or negotiated -in order to better suit a variety of environments. - -kMaxDatagramSize: -: The sender's maximum payload size. Does not include UDP or IP overhead. The - max packet size is used for calculating initial and minimum congestion - windows. The RECOMMENDED value is 1200 bytes. +Constants used in congestion control are based on a combination of RFCs, papers, +and common practice. kInitialWindow: -: Default limit on the initial amount of data in flight, in bytes. Taken from - {{?RFC6928}}, but increased slightly to account for the smaller 8 byte - overhead of UDP vs 20 bytes for TCP. The RECOMMENDED value is the minimum - of 10 * kMaxDatagramSize and max(2* kMaxDatagramSize, 14720)). +: Default limit on the initial amount of data in flight, in bytes. + The RECOMMENDED value is the minimum of 10 * max_datagram_size and + max(2 * max_datagram_size, 14720)). This follows the analysis and + recommendations in {{?RFC6928}}, increasing the byte limit to account + for the smaller 8 byte overhead of UDP compared to the 20 byte overhead + for TCP. kMinimumWindow: : Minimum congestion window in bytes. The RECOMMENDED value is - 2 * kMaxDatagramSize. + 2 * max_datagram_size. kLossReductionFactor: : Reduction in congestion window when a new loss event is detected. @@ -1259,8 +1274,8 @@ kPersistentCongestionThreshold: : Period of time for persistent congestion to be established, specified as a PTO multiplier. 
The rationale for this threshold is to enable a sender to use initial PTOs for aggressive probing, as TCP does with Tail Loss Probe (TLP) - {{TLP}} {{RACK}}, before establishing persistent congestion, as TCP does with - a Retransmission Timeout (RTO) {{?RFC5681}}. The RECOMMENDED value for + {{RACK}}, before establishing persistent congestion, as TCP does with a + Retransmission Timeout (RTO) {{?RFC5681}}. The RECOMMENDED value for kPersistentCongestionThreshold is 3, which is approximately equivalent to having two TLPs before an RTO in TCP. @@ -1270,6 +1285,13 @@ kPersistentCongestionThreshold: Variables required to implement the congestion control mechanisms are described in this section. +max_datagram_size: +: The sender's current maximum payload size. Does not include UDP or IP + overhead. The max datagram size is used for congestion window + computations. An endpoint sets the value of this variable based on its + PMTU (see Section 14.1 of {{QUIC-TRANSPORT}}), with a minimum value of + 1200 bytes. + ecn_ce_counters\[kPacketNumberSpace]: : The highest value reported for the ECN-CE counter in the packet number space by the peer in an ACK frame. This value is used to detect increases in the @@ -1338,16 +1360,16 @@ acked_packet from sent_packets. if (InCongestionRecovery(acked_packet.time_sent)): // Do not increase congestion window in recovery period. return - if (IsAppLimited()): + if (IsAppOrFlowControlLimited()): // Do not increase congestion_window if application - // limited. + // limited or flow control limited. return if (congestion_window < ssthresh): // Slow start. congestion_window += acked_packet.size else: // Congestion avoidance. - congestion_window += kMaxDatagramSize * acked_packet.size + congestion_window += max_datagram_size * acked_packet.size / congestion_window ~~~ @@ -1419,6 +1441,25 @@ Invoked from DetectLostPackets when packets are deemed lost. Issue and pull request numbers are listed with a leading octothorp. 
+## Since draft-ietf-quic-recovery-25 + +No significant changes. + +## Since draft-ietf-quic-recovery-24 + +- Require congestion control of some sort (#3247, #3244, #3248) +- Set a minimum reordering threshold (#3256, #3240) +- PTO is specific to a packet number space (#3067, #3074, #3066) + +## Since draft-ietf-quic-recovery-23 + +- Define under-utilizing the congestion window (#2630, #2686, #2675) +- PTO MUST send data if possible (#3056, #3057) +- Connection Close is not ack-eliciting (#3097, #3098) +- MUST limit bursts to the initial congestion window (#3160) +- Define the current max_datagram_size for congestion control + (#3041, #3167) + ## Since draft-ietf-quic-recovery-22 - PTO should always send an ack-eliciting packet (#2895) @@ -1493,7 +1534,7 @@ Issue and pull request numbers are listed with a leading octothorp. - Disable RTT calculation for packets that don't elicit acknowledgment (#2060, #2078) - Limit ack_delay by max_ack_delay (#2060, #2099) -- Initial keys are discarded once Handshake are avaialble (#1951, #2045) +- Initial keys are discarded once Handshake keys are available (#1951, #2045) - Reorder ECN and loss detection in pseudocode (#2142) - Only cancel loss detection timer if ack-eliciting packets are in flight (#2093, #2117) @@ -1592,5 +1633,22 @@ No significant changes. - Added table of contents +# Contributors + +The IETF QUIC Working Group received an enormous amount of support from many +people. The following people provided substantive contributions to this +document: +Alessandro Ghedini, +Benjamin Saunders, +Gorry Fairhurst, +Kazuho Oku, +Lars Eggert, +Magnus Westerlund, +Marten Seemann, +Martin Duke, +Martin Thomson, +Nick Banks, +Praveen Balasubramanian.
+ # Acknowledgments {:numbered="false"} diff --git a/draft-ietf-quic-tls.md b/draft-ietf-quic-tls.md index d93f1702f0..e06c6f34a2 100644 --- a/draft-ietf-quic-tls.md +++ b/draft-ietf-quic-tls.md @@ -80,7 +80,7 @@ informative: ISBN: 978-1466570269 QUIC-HTTP: - title: "Hypertext Transfer Protocol (HTTP) over QUIC" + title: "Hypertext Transfer Protocol Version 3 (HTTP/3)" date: {DATE} seriesinfo: Internet-Draft: draft-ietf-quic-http-latest @@ -88,7 +88,7 @@ informative: - ins: M. Bishop name: Mike Bishop - org: Microsoft + org: Akamai Technologies role: editor @@ -142,33 +142,34 @@ TLS provides two endpoints with a way to establish a means of communication over an untrusted medium (that is, the Internet) that ensures that messages they exchange cannot be observed, modified, or forged. -Internally, TLS is a layered protocol, with the structure shown below: +Internally, TLS is a layered protocol, with the structure shown in +{{tls-layers}}. ~~~~ -+--------------+--------------+--------------+ -| Handshake | Alerts | Application | -| Layer | | Data | -| | | | -+--------------+--------------+--------------+ -| | -| Record Layer | -| | -+--------------------------------------------+ + +-------------+------------+--------------+---------+ +Handshake | | | Application | | +Layer | Handshake | Alerts | Data | ... | + | | | | | + +-------------+------------+--------------+---------+ +Record | | +Layer | Records | + | | + +---------------------------------------------------+ ~~~~ +{: #tls-layers title="TLS Layers"} -Each upper layer (handshake, alerts, and application data) is carried as a -series of typed TLS records. Records are individually cryptographically -protected and then transmitted over a reliable transport (typically TCP) which -provides sequencing and guaranteed delivery. +Each Handshake layer message (e.g., Handshake, Alerts, and Application Data) is +carried as a series of typed TLS records by the Record layer. 
Records are +individually cryptographically protected and then transmitted over a reliable +transport (typically TCP) which provides sequencing and guaranteed delivery. -Change Cipher Spec records cannot be sent in QUIC. - -The TLS authenticated key exchange occurs between two entities: client and +The TLS authenticated key exchange occurs between two endpoints: client and server. The client initiates the exchange and the server responds. If the key exchange completes successfully, both client and server will agree on a secret. -TLS supports both pre-shared key (PSK) and Diffie-Hellman (DH) key exchanges. -PSK is the basis for 0-RTT; the latter provides perfect forward secrecy (PFS) -when the DH keys are destroyed. +TLS supports both pre-shared key (PSK) and Diffie-Hellman over either finite +fields or elliptic curves ((EC)DHE) key exchanges. PSK is the basis for 0-RTT; +the latter provides perfect forward secrecy (PFS) when the (EC)DHE keys are +destroyed. After completing the TLS handshake, the client will have learned and authenticated an identity for the server and the server is optionally able to @@ -180,18 +181,20 @@ shared secrets that cannot be controlled by either participating peer. TLS provides two basic handshake modes of interest to QUIC: - * A full 1-RTT handshake in which the client is able to send application data + * A full 1-RTT handshake in which the client is able to send Application Data after one round trip and the server immediately responds after receiving the first handshake message from the client. * A 0-RTT handshake in which the client uses information it has previously - learned about the server to send application data immediately. This - application data can be replayed by an attacker so it MUST NOT carry a + learned about the server to send Application Data immediately. This + Application Data can be replayed by an attacker so it MUST NOT carry a self-contained trigger for any non-idempotent action. 
A simplified TLS handshake with 0-RTT application data is shown in {{tls-full}}. Note that this omits the EndOfEarlyData message, which is not used in QUIC (see -{{remove-eoed}}). +{{remove-eoed}}). Likewise, neither ChangeCipherSpec nor KeyUpdate messages are +used by QUIC; ChangeCipherSpec is redundant in TLS 1.3 and QUIC has defined its +own key update mechanism {{key-update}}. ~~~ Client Server @@ -206,10 +209,10 @@ Note that this omits the EndOfEarlyData message, which is not used in QUIC (see [Application Data] <-------> [Application Data] - () Indicates messages protected by early data (0-RTT) keys - {} Indicates messages protected using handshake keys - [] Indicates messages protected using application data - (1-RTT) keys + () Indicates messages protected by Early Data (0-RTT) Keys + {} Indicates messages protected using Handshake Keys + [] Indicates messages protected using Application Data + (1-RTT) Keys ~~~ {: #tls-full title="TLS Handshake with 0-RTT"} @@ -220,12 +223,12 @@ Data is protected using a number of encryption levels: - Handshake Keys - Application Data (1-RTT) Keys -Application data may appear only in the early data and application data +Application Data may appear only in the Early Data and Application Data levels. Handshake and Alert messages may appear in any level. The 0-RTT handshake is only possible if the client and server have previously communicated. In the 1-RTT handshake, the client is unable to send protected -application data until it has received all of the handshake messages sent by the +Application Data until it has received all of the Handshake messages sent by the server. @@ -236,10 +239,9 @@ integrity protection of packets. For this it uses keys derived from a TLS handshake {{!TLS13}}, but instead of carrying TLS records over QUIC (as with TCP), TLS Handshake and Alert messages are carried directly over the QUIC transport, which takes over the responsibilities of the TLS record layer, as -shown below. 
+shown in {{quic-layers}}. ~~~~ - +--------------+--------------+ +-------------+ | TLS | TLS | | QUIC | | Handshake | Alerts | | Applications| @@ -255,14 +257,14 @@ shown below. | | +---------------------------------------------+ ~~~~ - +{: #quic-layers title="QUIC Layers"} QUIC also relies on TLS for authentication and negotiation of parameters that are critical to security and performance. -Rather than a strict layering, these two protocols are co-dependent: QUIC uses -the TLS handshake; TLS uses the reliability, ordered delivery, and record -layer provided by QUIC. +Rather than a strict layering, these two protocols cooperate: QUIC uses the TLS +handshake; TLS uses the reliability, ordered delivery, and record layer provided +by QUIC. At a high level, there are two main interactions between the TLS and QUIC components: @@ -326,8 +328,11 @@ encryption levels: - PADDING and PING frames MAY appear in packets of any encryption level. -- CRYPTO and CONNECTION_CLOSE frames MAY appear in packets of any encryption - level except 0-RTT. +- CRYPTO frames and CONNECTION_CLOSE frames signaling errors at the QUIC layer + (type 0x1c) MAY appear in packets of any encryption level except 0-RTT. + +- CONNECTION_CLOSE frames signaling application errors (type 0x1d) MUST only be + sent in packets at the 1-RTT encryption level. - ACK frames MAY appear in packets of any encryption level other than 0-RTT, but can only acknowledge packets which appeared in that packet number space. @@ -335,7 +340,8 @@ encryption levels: - All other frame types MUST only be sent in the 0-RTT and 1-RTT levels. Note that it is not possible to send the following frames in 0-RTT for various -reasons: ACK, CRYPTO, NEW_TOKEN, PATH_RESPONSE, and RETIRE_CONNECTION_ID. +reasons: ACK, CRYPTO, HANDSHAKE_DONE, NEW_TOKEN, PATH_RESPONSE, and +RETIRE_CONNECTION_ID. 
Because packets could be reordered on the wire, QUIC uses the packet type to indicate which level a given packet was encrypted under, as shown in @@ -385,13 +391,15 @@ perspective of the endpoint in question. ### Handshake Confirmed {#handshake-confirmed} -In this document, the TLS handshake is considered confirmed at an endpoint when -the following two conditions are met: the handshake is complete, and the -endpoint has received an acknowledgment for a packet sent with 1-RTT keys. -This second condition can be implemented by recording the lowest packet number -sent with 1-RTT keys, and the highest value of the Largest Acknowledged field -in any received 1-RTT ACK frame: once the latter is higher than or equal to the -former, the handshake is confirmed. +In this document, the TLS handshake is considered confirmed at the server when +the handshake completes. At the client, the handshake is considered confirmed +when a HANDSHAKE_DONE frame is received. + +A client MAY consider the handshake to be confirmed when it receives an +acknowledgement for a 1-RTT packet. This can be implemented by recording the +lowest packet number sent with 1-RTT keys, and comparing it to the Largest +Acknowledged field in any received 1-RTT ACK frame: once the latter is greater +than or equal to the former, the handshake is confirmed. ### Sending and Receiving Handshake Messages @@ -573,11 +581,16 @@ older than 1.3 is negotiated. ## ClientHello Size {#clienthello-size} -QUIC requires that the first Initial packet from a client contain an entire -cryptographic handshake message, which for TLS is the ClientHello. Though a -packet larger than 1200 bytes might be supported by the path, a client improves -the likelihood that a packet is accepted if it ensures that the first -ClientHello message is small enough to stay within this limit. +The first Initial packet from a client contains the start or all of its first +cryptographic handshake message, which for TLS is the ClientHello. 
Servers +might need to parse the entire ClientHello (e.g., to access extensions such as +Server Name Indication (SNI) or Application Layer Protocol Negotiation +(ALPN)) in order to decide whether to accept the new incoming QUIC connection. +If the ClientHello spans multiple Initial packets, such servers would need to +buffer the first received fragments, which could consume excessive resources if +the client's address has not yet been validated. To avoid this, servers MAY +use the Retry feature (see Section 8.1 of {{QUIC-TRANSPORT}}) to only buffer +partial ClientHello messages from clients with a validated address. QUIC packet and framing add at least 36 bytes of overhead to the ClientHello message. That overhead increases if the client chooses a connection ID without @@ -592,12 +605,9 @@ QUIC transport parameters, and other negotiable parameters and extensions could cause this message to grow. For servers, in addition to connection IDs and tokens, the size of TLS session -tickets can have an effect on a client's ability to connect. Minimizing the -size of these values increases the probability that they can be successfully -used by a client. - -A client is not required to fit the ClientHello that it sends in response to a -HelloRetryRequest message into a single UDP datagram. +tickets can have an effect on a client's ability to connect efficiently. +Minimizing the size of these values increases the probability that clients can +use them and still fit their ClientHello message in their first Initial packet. The TLS implementation does not need to ensure that the ClientHello is sufficiently large. QUIC PADDING frames are added to increase the size of the @@ -760,15 +770,9 @@ and ignoring any outstanding Initial packets. ### Discarding Handshake Keys -An endpoint MUST NOT discard its handshake keys until the TLS handshake is -confirmed ({{handshake-confirmed}}). An endpoint SHOULD discard its handshake -keys as soon as it has confirmed the handshake.
Most application protocols -will send data after the handshake, resulting in acknowledgements that allow -both endpoints to discard their handshake keys promptly. Endpoints that do -not have reason to send immediately after completing the handshake MAY send -ack-eliciting frames, such as PING, which will cause the handshake to be -confirmed when they are acknowledged. - +An endpoint MUST discard its handshake keys when the TLS handshake is confirmed +({{handshake-confirmed}}). The server MUST send a HANDSHAKE_DONE frame as soon +as it completes the handshake. ### Discarding 0-RTT Keys @@ -826,8 +830,7 @@ TLS 1.3 (see {{initial-secrets}}). ## Initial Secrets {#initial-secrets} Initial packets are protected with a secret derived from the Destination -Connection ID field from the client's first Initial packet of the -connection. Specifically: +Connection ID field from the client's Initial packet. Specifically: ~~~ initial_salt = 0xc3eef712c72ebb5a11a7d2432bb46365bef9f502 @@ -859,8 +862,10 @@ modifying the contents of packets from future versions. The HKDF-Expand-Label function defined in TLS 1.3 MUST be used for Initial packets even where the TLS versions offered do not include TLS 1.3. -{{test-vectors-initial}} contains test vectors for the initial packet -encryption. +The secrets used for protecting Initial packets change when a server sends a +Retry packet to use the connection ID value selected by the server. The secrets +do not change when a client changes the Destination Connection ID it uses in +response to an Initial packet from the server. Note: @@ -870,6 +875,8 @@ Note: that the server received its packet; the client has to rely on the exchange that included the Retry packet for that property. +{{test-vectors}} contains test vectors for packet encryption. + ## AEAD Usage {#aead} @@ -1165,25 +1172,34 @@ Note: Due to reordering and loss, protected packets might be received by an endpoint before the final TLS handshake messages are received. 
A client will be unable to decrypt 1-RTT packets from the server, whereas a server will be able to -decrypt 1-RTT packets from the client. +decrypt 1-RTT packets from the client. Endpoints in either role MUST NOT +decrypt 1-RTT packets from their peer prior to completing the handshake. Even though 1-RTT keys are available to a server after receiving the first handshake messages from a client, it is missing assurances on the client state: - The client is not authenticated, unless the server has chosen to use a -pre-shared key and validated the client's pre-shared key binder; see -Section 4.2.11 of {{!TLS13}}. + pre-shared key and validated the client's pre-shared key binder; see Section + 4.2.11 of {{!TLS13}}. + - The client has not demonstrated liveness, unless a RETRY packet was used. + - Any received 0-RTT data that the server responds to might be due to a replay -attack. + attack. + +Therefore, the server's use of 1-RTT keys MUST be limited to sending data before +the handshake is complete. A server MUST NOT process incoming 1-RTT protected +packets before the TLS handshake is complete. Because sending acknowledgments +indicates that all frames in a packet have been processed, a server cannot send +acknowledgments for 1-RTT packets until the TLS handshake is complete. Received +packets protected with 1-RTT keys MAY be stored and later decrypted and used +once the handshake is complete. + +Note: -Therefore, the server's use of 1-RTT keys is limited before the handshake is -complete. A server MUST NOT process data from incoming 1-RTT -protected packets before the TLS handshake is complete. Because -sending acknowledgments indicates that all frames in a packet have been -processed, a server cannot send acknowledgments for 1-RTT packets until the -TLS handshake is complete. Received packets protected with 1-RTT keys MAY be -stored and later decrypted and used once the handshake is complete. 
+: TLS implementations might provide all 1-RTT secrets prior to handshake + completion. Even where QUIC implementations have 1-RTT read keys, those keys + cannot be used prior to completing the handshake. The requirement for the server to wait for the client Finished message creates a dependency on that message being delivered. A client can avoid the @@ -1197,91 +1213,291 @@ TLS ClientHello. The server MAY retain these packets for later decryption in anticipation of receiving a ClientHello. +## Retry Packet Integrity {#retry-integrity} + +Retry packets (see the Retry Packet section of {{QUIC-TRANSPORT}}) carry a +Retry Integrity Tag that provides two properties: it allows discarding +packets that have accidentally been corrupted by the network, and it diminishes +off-path attackers' ability to send valid Retry packets. + +The Retry Integrity Tag is a 128-bit field that is computed as the output of +AEAD_AES_128_GCM {{!AEAD=RFC5116}} used with the following inputs: + +- The secret key, K, is 128 bits equal to 0x4d32ecdb2a2133c841e4043df27d4430. +- The nonce, N, is 96 bits equal to 0x4d1611d05513a552c587d575. +- The plaintext, P, is empty. +- The associated data, A, is the contents of the Retry Pseudo-Packet, as + illustrated in {{retry-pseudo}}: + +The secret key and the nonce are values derived by calling HKDF-Expand-Label +using 0x656e61e336ae9417f7f0edd8d78d461e2aa7084aba7a14c1e9f726d55709169a as the +secret, with labels being "quic key" and "quic iv" ({{protection-keys}}). + +~~~ + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| ODCID Len (8) | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Original Destination Connection ID (0..160) ... 
++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +|1|1| 3 | Unused| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Version (32) | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| DCID Len (8) | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Destination Connection ID (0..160) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| SCID Len (8) | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Source Connection ID (0..160) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Retry Token (*) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +~~~ +{: #retry-pseudo title="Retry Pseudo-Packet"} + +The Retry Pseudo-Packet is not sent over the wire. It is computed by taking +the transmitted Retry packet, removing the Retry Integrity Tag and prepending +the two following fields: + +ODCID Len: + +: The ODCID Len contains the length in bytes of the Original Destination + Connection ID field that follows it, encoded as an 8-bit unsigned integer. + +Original Destination Connection ID: + +: The Original Destination Connection ID contains the value of the Destination + Connection ID from the Initial packet that this Retry is in response to. The + length of this field is given in ODCID Len. The presence of this field + mitigates an off-path attacker's ability to inject a Retry packet. + + # Key Update -Once the handshake is confirmed, it is possible to update the keys. The -KEY_PHASE bit in the short header is used to indicate whether key updates -have occurred. The KEY_PHASE bit is initially set to 0 and then inverted -with each key update. +Once the handshake is confirmed (see {{handshake-confirmed}}), an endpoint MAY +initiate a key update. + +The Key Phase bit indicates which packet protection keys are used to protect the +packet. 
The Key Phase bit is initially set to 0 for the first set of 1-RTT +packets and toggled to signal each subsequent key update. -The KEY_PHASE bit allows a recipient to detect a change in keying material -without necessarily needing to receive the first packet that triggered the -change. An endpoint that notices a changed KEY_PHASE bit can update keys and -decrypt the packet that contains the changed bit. +The Key Phase bit allows a recipient to detect a change in keying material +without needing to receive the first packet that triggered the change. An +endpoint that notices a changed Key Phase bit updates keys and decrypts the +packet that contains the changed value. This mechanism replaces the TLS KeyUpdate message. Endpoints MUST NOT send a TLS KeyUpdate message. Endpoints MUST treat the receipt of a TLS KeyUpdate message as a connection error of type 0x10a, equivalent to a fatal TLS alert of unexpected_message (see {{tls-errors}}). -An endpoint MUST NOT initiate the first key update until the handshake is -confirmed ({{handshake-confirmed}}). An endpoint MUST NOT initiate a subsequent -key update until it has received an acknowledgment for a packet sent at the -current KEY_PHASE. This can be implemented by tracking the lowest packet -number sent with each KEY_PHASE, and the highest acknowledged packet number -in the 1-RTT space: once the latter is higher than or equal to the former, -another key update can be initiated. - -Endpoints MAY limit the number of keys they retain to two sets for removing -packet protection and one set for protecting packets. Older keys can be -discarded. Updating keys multiple times rapidly can cause packets to be -effectively lost if packets are significantly reordered. Therefore, an -endpoint SHOULD NOT initiate a key update for some time after it has last -updated keys; the RECOMMENDED time period is three times the PTO. This avoids -valid reordered packets being dropped by the peer as a result of the peer -discarding older keys. 
- -A receiving endpoint detects an update when the KEY_PHASE bit does not match -what it is expecting. It creates a new secret (see Section 7.2 of {{!TLS13}}) -and the corresponding read key and IV using the KDF function provided by TLS. -The header protection key is not updated. - -If the packet can be decrypted and authenticated using the updated key and IV, -then the keys the endpoint uses for packet protection are also updated. The -next packet sent by the endpoint MUST then use the new keys. Once an endpoint -has sent a packet encrypted with a given key phase, it MUST NOT send a packet -encrypted with an older key phase. - -An endpoint does not always need to send packets when it detects that its peer -has updated keys. The next packet that it sends will simply use the new keys. -If an endpoint detects a second update before it has sent any packets with -updated keys, it indicates that its peer has updated keys twice without awaiting -a reciprocal update. An endpoint MUST treat consecutive key updates as a fatal -error and abort the connection. - -An endpoint SHOULD retain old keys for a period of no more than three times the -PTO. After this period, old keys and their corresponding secrets SHOULD be -discarded. Retaining keys allow endpoints to process packets that were sent -with old keys and delayed in the network. Packets with higher packet numbers -always use the updated keys and MUST NOT be decrypted with old keys. - -This ensures that once the handshake is complete, packets with the same -KEY_PHASE will have the same packet protection keys, unless there are multiple -key updates in a short time frame succession and significant packet reordering. +{{ex-key-update}} shows a key update process, where the initial set of keys used +(identified with @M) are replaced by updated keys (identified with @N). The +value of the Key Phase bit is indicated in brackets \[]. 
~~~ Initiating Peer Responding Peer -@M QUIC Frames - New Keys -> @N -@N QUIC Frames +@M [0] QUIC Packets + +... Update to @N +@N [1] QUIC Packets --------> - QUIC Frames @M - New Keys -> @N - QUIC Frames @N + Update to @N ... + QUIC Packets [1] @N <-------- + QUIC Packets [1] @N + containing ACK + <-------- +... Key Update Permitted + +@N [1] QUIC Packets + containing ACK for @N packets + --------> + Key Update Permitted ... ~~~ {: #ex-key-update title="Key Update"} -A packet that triggers a key update could arrive after the receiving endpoint -successfully processed a packet with a higher packet number. This is only -possible if there is a key compromise and an attack, or if the peer is -incorrectly reverting to use of old keys. Because the latter cannot be -differentiated from an attack, an endpoint MUST immediately terminate the -connection if it detects this condition. -In deciding when to update keys, endpoints MUST NOT exceed the limits for use of -specific keys, as described in Section 5.5 of {{!TLS13}}. +## Initiating a Key Update {#key-update-initiate} + +Endpoints maintain separate read and write secrets for packet protection. An +endpoint initiates a key update by updating its packet protection write secret +and using that to protect new packets. The endpoint creates a new write secret +from the existing write secret as performed in Section 7.2 of {{!TLS13}}. This +uses the KDF function provided by TLS with a label of "quic ku". The +corresponding key and IV are created from that secret as defined in +{{protection-keys}}. The header protection key is not updated. + +For example, to update write keys with TLS 1.3, HKDF-Expand-Label is used as: + +~~~ +secret_<n+1> = HKDF-Expand-Label(secret_<n>, "quic ku", + "", Hash.length) +~~~ + +The endpoint toggles the value of the Key Phase bit and uses the updated key and +IV to protect all subsequent packets. + +An endpoint MUST NOT initiate a key update prior to having confirmed the +handshake ({{handshake-confirmed}}).
An endpoint MUST NOT initiate a subsequent +key update unless it has received an acknowledgment for a packet that was +sent protected with keys from the current key phase. This ensures that keys are +available to both peers before another key update can be initiated. This can be +implemented by tracking the lowest packet number sent with each key phase, and +the highest acknowledged packet number in the 1-RTT space: once the latter is +higher than or equal to the former, another key update can be initiated. + +Note: + +: Keys of packets other than the 1-RTT packets are never updated; their keys are + derived solely from the TLS handshake state. + +The endpoint that initiates a key update also updates the keys that it uses for +receiving packets. These keys will be needed to process packets the peer sends +after updating. + +An endpoint SHOULD retain old keys so that packets sent by its peer prior to +receiving the key update can be processed. Discarding old keys too early can +cause delayed packets to be discarded. Discarding packets will be interpreted +as packet loss by the peer and could adversely affect performance. + + +## Responding to a Key Update + +A peer is permitted to initiate a key update after receiving an acknowledgement +of a packet in the current key phase. An endpoint detects a key update when +processing a packet with a key phase that differs from the value last used to +protect the last packet it sent. To process this packet, the endpoint uses the +next packet protection key and IV. See {{receive-key-generation}} for +considerations about generating these keys. + +If a packet is successfully processed using the next key and IV, then the peer +has initiated a key update. The endpoint MUST update its send keys to the +corresponding key phase in response, as described in {{key-update-initiate}}. +Sending keys MUST be updated before sending an acknowledgement for the packet +that was received with updated keys.
By acknowledging the packet that triggered +the key update in a packet protected with the updated keys, the endpoint signals +that the key update is complete. + +An endpoint can defer sending the packet or acknowledgement according to its +normal packet sending behaviour; it is not necessary to immediately generate a +packet in response to a key update. The next packet sent by the endpoint will +use the updated keys. The next packet that contains an acknowledgement will +cause the key update to be completed. If an endpoint detects a second update +before it has sent any packets with updated keys containing an +acknowledgement for the packet that initiated the key update, it indicates that +its peer has updated keys twice without awaiting confirmation. An endpoint MAY +treat consecutive key updates as a connection error of type KEY_UPDATE_ERROR. + +An endpoint that receives an acknowledgement that is carried in a packet +protected with old keys where any acknowledged packet was protected with newer +keys MAY treat that as a connection error of type KEY_UPDATE_ERROR. This +indicates that a peer has received and acknowledged a packet that initiates a +key update, but has not updated keys in response. + + +## Timing of Receive Key Generation {#receive-key-generation} + +Endpoints responding to an apparent key update MUST NOT generate a timing +side-channel signal that might indicate that the Key Phase bit was invalid (see +{{header-protect-analysis}}). Endpoints can use dummy packet protection keys in +place of discarded keys when key updates are not yet permitted. Using dummy +keys will generate no variation in the timing signal produced by attempting to +remove packet protection, and results in all packets with an invalid Key Phase +bit being rejected. + +The process of creating new packet protection keys for receiving packets could +reveal that a key update has occurred. 
An endpoint MAY perform this process as +part of packet processing, but this creates a timing signal that can be used by +an attacker to learn when key updates happen and thus the value of the Key Phase +bit in certain packets. Endpoints MAY instead defer the creation of the next +set of receive packet protection keys until some time after a key update +completes, up to three times the PTO; see {{old-keys-recv}}. + +Once generated, the next set of packet protection keys SHOULD be retained, even +if the packet that was received was subsequently discarded. Packets containing +apparent key updates are easy to forge and - while the process of key update +does not require significant effort - triggering this process could be used by +an attacker for DoS. + +For this reason, endpoints MUST be able to retain two sets of packet protection +keys for receiving packets: the current and the next. Retaining the previous +keys in addition to these might improve performance, but this is not essential. + + +## Sending with Updated Keys {#old-keys-send} + +An endpoint always sends packets that are protected with the newest keys. Keys +used for packet protection can be discarded immediately after switching to newer +keys. + +Packets with higher packet numbers MUST be protected with either the same or +newer packet protection keys than packets with lower packet numbers. An +endpoint that successfully removes protection with old keys when newer keys were +used for packets with lower packet numbers MUST treat this as a connection error +of type KEY_UPDATE_ERROR. + + +## Receiving with Different Keys {#old-keys-recv} + +For receiving packets during a key update, packets protected with older keys +might arrive if they were delayed by the network. Retaining old packet +protection keys allows these packets to be successfully processed. 
+ +As packets protected with keys from the next key phase use the same Key Phase +value as those protected with keys from the previous key phase, it can be +necessary to distinguish between the two. This can be done using packet +numbers. A recovered packet number that is lower than any packet number from +the current key phase uses the previous packet protection keys; a recovered +packet number that is higher than any packet number from the current key phase +requires the use of the next packet protection keys. + +Some care is necessary to ensure that any process for selecting between +previous, current, and next packet protection keys does not expose a timing side +channel that might reveal which keys were used to remove packet protection. See +{{hp-side-channel}} for more information. + +Alternatively, endpoints can retain only two sets of packet protection keys, +swapping previous for next after enough time has passed to allow for reordering +in the network. In this case, the Key Phase bit alone can be used to select +keys. + +An endpoint MAY allow a period of approximately the Probe Timeout (PTO; see +{{QUIC-RECOVERY}}) after a key update before it creates the next set of packet +protection keys. These updated keys MAY replace the previous keys at that time. +With the caveat that PTO is a subjective measure - that is, a peer could have a +different view of the RTT - this time is expected to be long enough that any +reordered packets would be declared lost by a peer even if they were +acknowledged and short enough to allow for subsequent key updates. + +Endpoints need to allow for the possibility that a peer might not be able to +decrypt packets that initiate a key update during the period when it retains old +keys. Endpoints SHOULD wait three times the PTO before initiating a key update +after receiving an acknowledgment that confirms that the previous key update was +received. Failing to allow sufficient time could lead to packets being +discarded. 
+ +An endpoint SHOULD retain old read keys for no more than three times the PTO. +After this period, old read keys and their corresponding secrets SHOULD be +discarded. + + +## Key Update Frequency + +Key updates MUST be initiated before usage limits on packet protection keys are +exceeded. For the cipher suites mentioned in this document, the limits in +Section 5.5 of {{!TLS13}} apply. Other cipher suites MUST define usage limits +in order to be used with QUIC. + + +## Key Update Error Code {#key-update-error} + +The KEY_UPDATE_ERROR error code (0xE) is used to signal errors related to key +updates. # Security of Initial Messages @@ -1316,27 +1532,28 @@ parameters and allows a server to perform return routability checks on clients. QUIC requires that the cryptographic handshake provide authenticated protocol negotiation. TLS uses Application Layer Protocol Negotiation (ALPN) -{{!RFC7301}} to select an application protocol. Unless another mechanism is -used for agreeing on an application protocol, endpoints MUST use ALPN for this -purpose. When using ALPN, endpoints MUST immediately close a connection (see -Section 10.3 in {{QUIC-TRANSPORT}}) if an application protocol is not -negotiated with a no_application_protocol TLS alert (QUIC error code 0x178, -see {{tls-errors}}). While {{!RFC7301}} only specifies that servers use this -alert, QUIC clients MUST also use it to terminate a connection when ALPN -negotiation fails. - -An application-layer protocol MAY restrict the QUIC versions that it can operate -over. Servers MUST select an application protocol compatible with the QUIC -version that the client has selected. If the server cannot select a compatible -combination of application protocol and QUIC version, it MUST abort the -connection. A client MUST abort a connection if the server picks an application -protocol incompatible with the protocol version being used. +{{!ALPN=RFC7301}} to select an application protocol. 
Unless another mechanism +is used for agreeing on an application protocol, endpoints MUST use ALPN for +this purpose. When using ALPN, endpoints MUST immediately close a connection +(see Section 10.3 in {{QUIC-TRANSPORT}}) if an application protocol is not +negotiated with a no_application_protocol TLS alert (QUIC error code 0x178, see +{{tls-errors}}). While {{!ALPN}} only specifies that servers use this alert, +QUIC clients MUST also use it to terminate a connection when ALPN negotiation +fails. + +An application protocol MAY restrict the QUIC versions that it can operate over. +Servers MUST select an application protocol compatible with the QUIC version +that the client has selected. The server MUST treat the inability to select a +compatible application protocol as a connection error of type 0x178 +(no_application_protocol). Similarly, a client MUST treat the selection of an +incompatible application protocol by a server as a connection error of type +0x178. ## QUIC Transport Parameters Extension {#quic_parameters} QUIC transport parameters are carried in a TLS extension. Different versions of -QUIC might define a different format for this struct. +QUIC might define a different method for negotiating transport configuration. Including transport parameters in the TLS handshake provides integrity protection for these values. @@ -1348,9 +1565,7 @@ protection for these values. ~~~ The `extension_data` field of the quic_transport_parameters extension contains a -value that is defined by the version of QUIC that is in use. The -quic_transport_parameters extension carries a TransportParameters struct when -the version of QUIC defined in {{QUIC-TRANSPORT}} is used. +value that is defined by the version of QUIC that is in use. The quic_transport_parameters extension is carried in the ClientHello and the EncryptedExtensions messages during the handshake. 
Endpoints MUST send the @@ -1491,20 +1706,39 @@ authenticated using packet protection; the entire packet header is part of the authenticated additional data. Protected fields that are falsified or modified can only be detected once the packet protection is removed. -An attacker could guess values for packet numbers and have an endpoint confirm -guesses through timing side channels. Similarly, guesses for the packet number -length can be trialed and exposed. If the recipient of a packet discards -packets with duplicate packet numbers without attempting to remove packet -protection they could reveal through timing side-channels that the packet number -matches a received packet. For authentication to be free from side-channels, -the entire process of header protection removal, packet number recovery, and -packet protection removal MUST be applied together without timing and other -side-channels. + +## Header Protection Timing Side-Channels {#hp-side-channel} + +An attacker could guess values for packet numbers or Key Phase and have an +endpoint confirm guesses through timing side channels. Similarly, guesses for +the packet number length can be trialed and exposed. If the recipient of a +packet discards packets with duplicate packet numbers without attempting to +remove packet protection they could reveal through timing side-channels that the +packet number matches a received packet. For authentication to be free from +side-channels, the entire process of header protection removal, packet number +recovery, and packet protection removal MUST be applied together without timing +and other side-channels. For the sending of packets, construction and protection of packet payloads and packet numbers MUST be free from side-channels that would reveal the packet number or its encoded size. +During a key update, the time taken to generate new keys could reveal through +timing side-channels that a key update has occurred. 
Alternatively, where an +attacker injects packets this side-channel could reveal the value of the Key +Phase on injected packets. After receiving a key update, an endpoint SHOULD +generate and save the next set of receive packet protection keys, as described +in {{receive-key-generation}}. By generating new keys before a key update is +received, receipt of packets will not create timing signals that leak the value +of the Key Phase. + +This depends on not doing this key generation during packet processing and it +can require that endpoints maintain three sets of packet protection keys for +receiving: for the previous key phase, for the current key phase, and for the +next key phase. Endpoints can instead choose to defer generation of the next +receive packet protection keys until they discard old keys so that only two sets +of receive keys need to be retained at any point in time. + ## Key Diversity @@ -1533,21 +1767,24 @@ secrets. This document does not create any new IANA registries, but it registers the values in the following registries: -* TLS ExtensionsType Registry {{!TLS-REGISTRIES=RFC8447}} - IANA is to register - the quic_transport_parameters extension found in {{quic_parameters}}. The - Recommended column is to be marked Yes. The TLS 1.3 Column is to include CH - and EE. +* TLS ExtensionType Values Registry {{!TLS-REGISTRIES=RFC8447}} - IANA is to + register the quic_transport_parameters extension found in {{quic_parameters}}. + The Recommended column is to be marked Yes. The TLS 1.3 Column is to include + CH and EE. + +* QUIC Transport Error Codes Registry {{QUIC-TRANSPORT}} - IANA is to register + the KEY_UPDATE_ERROR (0xE), as described in {{key-update-error}}. --- back -# Sample Initial Packet Protection {#test-vectors-initial} +# Sample Packet Protection {#test-vectors} -This section shows examples of packet protection for Initial packets so that -implementations can be verified incrementally. 
These packets use an 8-byte -client-chosen Destination Connection ID of 0x8394c8f03e515708. Values for both -server and client packet protection are shown together with values in -hexadecimal. +This section shows examples of packet protection so that implementations can be +verified incrementally. Samples of Initial packets from both client and server, +plus a Retry packet are defined. These packets use an 8-byte client-chosen +Destination Connection ID of 0x8394c8f03e515708. Some intermediate values are +included. All values are shown in hexadecimal. ## Keys @@ -1614,7 +1851,7 @@ hp = HKDF-Expand-Label(server_initial_secret, "quic hp", _, 16) ~~~ -## Client Initial +## Client Initial {#sample-client-initial} The client sends an Initial packet. The unprotected payload of this packet contains the following CRYPTO frame, plus enough PADDING frames to make a 1162 @@ -1634,7 +1871,7 @@ The unprotected header includes the connection ID and a 4 byte packet number encoding for a packet number of 2: ~~~ -c3ff000017088394c8f03e5157080000449e00000002 +c3ff000019088394c8f03e5157080000449e00000002 ~~~ Protecting the payload produces output that is sampled for header protection. 
@@ -1649,15 +1886,15 @@ mask = AES-ECB(hp, sample)[0..4] header[0] ^= mask[0] & 0x0f = c0 -header[17..20] ^= mask[1..4] +header[18..21] ^= mask[1..4] = 3b343aa8 -header = c0ff000017088394c8f03e5157080000449e3b343aa8 +header = c0ff000019088394c8f03e5157080000449e3b343aa8 ~~~ The resulting protected packet is: ~~~ -c0ff000017088394c8f03e5157080000 449e3b343aa8535064a4268a0d9d7b1c +c0ff000019088394c8f03e5157080000 449e3b343aa8535064a4268a0d9d7b1c 9d250ae355162276e9b1e3011ef6bbc0 ab48ad5bcc2681e953857ca62becd752 4daac473e68d7405fbba4e9ee616c870 38bdbe908c06d9605d9ac49030359eec b1d05a14e117db8cede2bb09d0dbbfee 271cb374d8f10abec82d0f59a1dee29f @@ -1694,9 +1931,10 @@ eaf45a9bf27dc0c1e784161691220913 13eb0e87555abd706626e557fc36a04f cd191a58829104d6075c5594f627ca50 6bf181daec940f4a4f3af0074eee89da acde6758312622d4fa675b39f728e062 d2bee680d8f41a597c262648bb18bcfc 13c8b3d97b1a77b2ac3af745d61a34cc 4709865bac824a94bb19058015e4e42d -c9be6c7803567321829dd85853396269 +aebe13f98ec51170a4aad0a8324bb768 ~~~ + ## Server Initial The server sends the following payload in response, including an ACK frame, a @@ -1713,7 +1951,7 @@ The header from the server includes a new connection ID and a 2-byte packet number encoding for a packet number of 1: ~~~ -c1ff0000170008f067a5502a4262b50040740001 +c1ff0000190008f067a5502a4262b50040740001 ~~~ As a result, after protection, the header protection sample is taken starting @@ -1722,17 +1960,30 @@ from the third protected octet: ~~~ sample = 7002596f99ae67abf65a5852f54f58c3 mask = 38168a0c25 -header = c9ff0000170008f067a5502a4262b5004074168b +header = c9ff0000190008f067a5502a4262b5004074168b ~~~ The final protected packet is then: ~~~ -c9ff0000170008f067a5502a4262b500 4074168bf22b7002596f99ae67abf65a +c9ff0000190008f067a5502a4262b500 4074168bf22b7002596f99ae67abf65a 5852f54f58c37c808682e2e40492d8a3 899fb04fc0afe9aabc8767b18a0aa493 537426373b48d502214dd856d63b78ce e37bc664b3fe86d487ac7a77c53038a3 -cd32f0b5004d9f5754c4f7f2d1f35cf3 
f7116351c92b9cf9bb6d091ddfc8b32d -432348a2c413 +cd32f0b5004d9f5754c4f7f2d1f35cf3 f7116351c92b99c8ae5833225cb51855 +20d61e68cf5f +~~~ + + +## Retry + +This shows a Retry packet that might be sent in response to the Initial packet +in {{sample-client-initial}}. The integrity check includes the client-chosen +connection ID value of 0x8394c8f03e515708, but that value is not +included in the final Retry packet: + +~~~ +ffff0000190008f067a5502a4262b574 6f6b656e1e5ec5b014cbb1f0fd93df40 +48c446a6 ~~~ @@ -1743,6 +1994,26 @@ cd32f0b5004d9f5754c4f7f2d1f35cf3 f7116351c92b9cf9bb6d091ddfc8b32d Issue and pull request numbers are listed with a leading octothorp. +## Since draft-ietf-quic-tls-25 + +- No changes + +## Since draft-ietf-quic-tls-24 + +- Rewrite key updates (#3050) + - Allow but don't recommend deferring key updates (#2792, #3263) + - More completely define received behavior (#2791) + - Define the label used with HKDF-Expand-Label (#3054) + +## Since draft-ietf-quic-tls-23 + +- Key update text update (#3050): + - Recommend constant-time key replacement (#2792) + - Provide explicit labels for key update key derivation (#3054) +- Allow first Initial from a client to span multiple packets (#2928, #3045) +- PING can be sent at any encryption level (#3034, #3035) + + ## Since draft-ietf-quic-tls-22 - Update the salt used for Initial secrets (#2887, #2980) @@ -1783,7 +2054,7 @@ Issue and pull request numbers are listed with a leading octothorp. - TLS provides an AEAD and KDF function (#2046) - Clarify that the TLS KDF is used with TLS (#1997) - Change the labels for calculation of QUIC keys (#1845, #1971, #1991) -- Initial keys are discarded once Handshake are avaialble (#1951, #2045) +- Initial keys are discarded once Handshake keys are available (#1951, #2045) ## Since draft-ietf-quic-tls-13 @@ -1879,15 +2150,29 @@ No significant changes. 
- Added status note -# Acknowledgments -{:numbered="false"} - -This document has benefited from input from Dragana Damjanovic, Christian -Huitema, Jana Iyengar, Adam Langley, Roberto Peon, Eric Rescorla, Ian Swett, and -many others. - - # Contributors {:numbered="false"} -Ryan Hamilton was originally an author of this specification. +The IETF QUIC Working Group received an enormous amount of support from many +people. The following people provided substantive contributions to this +document: +Adam Langley, +Alessandro Ghedini, +Christian Huitema, +Christopher Wood, +David Schinazi, +Dragana Damjanovic, +Eric Rescorla, +Ian Swett, +Jana Iyengar, <contact asciiFullname="Kazuho Oku" fullname="奥 一穂"/>, +Marten Seemann, +Martin Duke, +Mike Bishop, <contact fullname="Mikkel Fahnøe Jørgensen"/>, +Nick Banks, +Nick Harper, +Roberto Peon, +Rui Paulo, +Ryan Hamilton, +and Victor Vasiliev. diff --git a/draft-ietf-quic-transport.md b/draft-ietf-quic-transport.md index d490d57a5b..76156c1be5 100644 --- a/draft-ietf-quic-transport.md +++ b/draft-ietf-quic-transport.md @@ -132,7 +132,7 @@ incurring a dependency on middleboxes. ## Document Structure -This document describes the core QUIC protocol and is structured as follows. +This document describes the core QUIC protocol and is structured as follows: * Streams are the basic service abstraction that QUIC provides. - {{streams}} describes core concepts related to streams, @@ -192,8 +192,15 @@ QUIC packet: Ack-eliciting Packet: -: A QUIC packet that contains frames other than ACK and PADDING. These cause a - recipient to send an acknowledgment (see {{sending-acknowledgements}}). +: A QUIC packet that contains frames other than ACK, PADDING, and + CONNECTION_CLOSE. These cause a recipient to send an acknowledgment (see + {{sending-acknowledgements}}). + +Out-of-order packet: + +: A packet that does not increase the largest received packet number for its + packet number space ({{packet-numbers}}) by exactly one. A packet can arrive + out of order if it is delayed or if earlier packets are lost or delayed.
Endpoint: @@ -209,6 +216,11 @@ Server: : The endpoint accepting incoming QUIC connections. +Address: + +: When used without qualification, the tuple of IP version, IP address, UDP + protocol, and UDP port number that represents one end of a network path. + Connection ID: : An opaque identifier that is used to identify a QUIC connection at an @@ -222,7 +234,7 @@ Stream: Application: - : An entity that uses QUIC to send and receive data. +: An entity that uses QUIC to send and receive data. ## Notational Conventions @@ -350,7 +362,7 @@ the relative priority of streams. When deciding which streams to dedicate resources to, the implementation SHOULD use the information provided by the application. -## Required Operations on Streams +## Required Operations on Streams {#stream-operations} There are certain operations which an application MUST be able to perform when interacting with QUIC streams. This document does not specify an API, but @@ -361,17 +373,17 @@ On the sending part of a stream, application protocols need to be able to: - write data, understanding when stream flow control credit ({{data-flow-control}}) has successfully been reserved to send the written - data + data; - end the stream (clean termination), resulting in a STREAM frame ({{frame-stream}}) with the FIN bit set; and - reset the stream (abrupt termination), resulting in a RESET_STREAM frame - ({{frame-reset-stream}}), even if the stream was already ended. + ({{frame-reset-stream}}), if the stream was not already in a terminal state. On the receiving part of a stream, application protocols need to be able to: -- read data +- read data; and - abort reading of the stream and request closure, possibly resulting in a - STOP_SENDING frame ({{frame-stop-sending}}) + STOP_SENDING frame ({{frame-stop-sending}}). 
Applications also need to be informed of state changes on streams, including when the peer has opened or reset a stream, when a peer aborts reading on a @@ -459,9 +471,8 @@ allocating a stream ID to a stream until it sends the first STREAM frame and enters this state, which can allow for better stream prioritization. The sending part of a bidirectional stream initiated by a peer (type 0 for a -server, type 1 for a client) enters the "Ready" state then immediately -transitions to the "Send" state if the receiving part enters the "Recv" state -({{stream-recv-states}}). +server, type 1 for a client) starts in the "Ready" state when the receiving part +is created. In the "Send" state, an endpoint transmits - and retransmits as necessary - stream data in STREAM frames. The endpoint respects the flow control limits set @@ -784,44 +795,47 @@ flow control limits. If a sender runs out of flow control credit, it will be unable to send new data and is considered blocked. A sender SHOULD send a STREAM_DATA_BLOCKED or DATA_BLOCKED frame to indicate it has data to write but is blocked by flow -control limits. These frames are expected to be sent infrequently in common -cases, but they are considered useful for debugging and monitoring purposes. - -A sender SHOULD NOT send multiple STREAM_DATA_BLOCKED or DATA_BLOCKED frames -for the same data limit, unless the original frame is determined to be lost. -Another STREAM_DATA_BLOCKED or DATA_BLOCKED frame can be sent after the data -limit is increased. +control limits. If a sender is blocked for a period longer than the idle +timeout ({{idle-timeout}}), the connection might be closed even when data is +available for transmission. To keep the connection from closing, a sender that +is flow control limited SHOULD periodically send a STREAM_DATA_BLOCKED or +DATA_BLOCKED frame when it has no ack-eliciting packets in flight. 
## Flow Credit Increments {#fc-credit} -This document leaves when and how many bytes to advertise in a MAX_STREAM_DATA -or MAX_DATA frame to implementations, but offers a few considerations. These -frames contribute to connection overhead. Therefore frequently sending frames -with small changes is undesirable. At the same time, larger increments to -limits are necessary to avoid blocking if updates are less frequent, requiring -larger resource commitments at the receiver. Thus there is a trade-off between -resource commitment and overhead when determining how large a limit is -advertised. +Implementations decide when and how much credit to advertise in MAX_STREAM_DATA +and MAX_DATA frames, but this section offers a few considerations. + +To avoid blocking a sender, a receiver can send a MAX_STREAM_DATA or MAX_DATA +frame multiple times within a round trip or send it early enough to allow for +recovery from loss of the frame. + +Control frames contribute to connection overhead. Therefore, frequently sending +MAX_STREAM_DATA and MAX_DATA frames with small changes is undesirable. On the +other hand, if updates are less frequent, larger increments to limits are +necessary to avoid blocking a sender, requiring larger resource commitments at +the receiver. There is a trade-off between resource commitment and overhead +when determining how large a limit is advertised. A receiver can use an autotuning mechanism to tune the frequency and amount of advertised additional credit based on a round-trip time estimate and the rate at which the receiving application consumes data, similar to common TCP -implementations. As an optimization, sending frames related to flow control -only when there are other frames to send or when a peer is blocked ensures that -flow control doesn't cause extra packets to be sent. +implementations. 
As an optimization, an endpoint could send frames related to +flow control only when there are other frames to send or when a peer is blocked, +ensuring that flow control does not cause extra packets to be sent. -If a sender runs out of flow control credit, it will be unable to send new data -and is considered blocked. It is generally considered best to not let the -sender become blocked. To avoid blocking a sender, and to reasonably account -for the possibility of loss, a receiver should send a MAX_DATA or -MAX_STREAM_DATA frame at least two round trips before it expects the sender to -get blocked. +A blocked sender is not required to send STREAM_DATA_BLOCKED or DATA_BLOCKED +frames. Therefore, a receiver MUST NOT wait for a STREAM_DATA_BLOCKED or +DATA_BLOCKED frame before sending a MAX_STREAM_DATA or MAX_DATA frame; doing so +could result in the sender being blocked for the rest of the connection. Even if +the sender sends these frames, waiting for them will result in the sender being +blocked for at least an entire round trip. -A receiver MUST NOT wait for a STREAM_DATA_BLOCKED or DATA_BLOCKED frame before -sending MAX_STREAM_DATA or MAX_DATA, since doing so will mean that a sender will -be blocked for at least an entire round trip, and potentially for longer if the -peer chooses to not send STREAM_DATA_BLOCKED or DATA_BLOCKED frames. +When a sender receives credit after being blocked, it might be able to send a +large amount of data in response, resulting in short-term congestion; see +Section 6.9 in {{QUIC-RECOVERY}} for a discussion of how a sender can avoid this +congestion. ## Handling Stream Cancellation {#stream-cancellation} @@ -967,10 +981,10 @@ failures in the presence of peer connection migration, NAT rebinding, and client port reuse; and therefore MUST NOT be done unless an endpoint is certain that those protocol features are not in use. 
-When an endpoint has requested a non-zero-length connection ID, it needs to -ensure that the peer has a supply of connection IDs from which to choose for -packets sent to the endpoint. These connection IDs are supplied by the endpoint -using the NEW_CONNECTION_ID frame ({{frame-new-connection-id}}). +When an endpoint uses a non-zero-length connection ID, it needs to ensure that +the peer has a supply of connection IDs from which to choose for packets sent to +the endpoint. These connection IDs are supplied by the endpoint using the +NEW_CONNECTION_ID frame ({{frame-new-connection-id}}). ### Issuing Connection IDs {#issue-cid} @@ -993,17 +1007,22 @@ When an endpoint issues a connection ID, it MUST accept packets that carry this connection ID for the duration of the connection or until its peer invalidates the connection ID via a RETIRE_CONNECTION_ID frame ({{frame-retire-connection-id}}). Connection IDs that are issued and not -retired are considered active; any active connection ID can be used. +retired are considered active; any active connection ID is valid for use at any +time, in any packet type. This includes the connection ID issued by the server +via the preferred_address transport parameter. An endpoint SHOULD ensure that its peer has a sufficient number of available and unused connection IDs. Endpoints store received connection IDs for future use and advertise the number of connection IDs they are willing to store with the -active_connection_id_limit transport parameter. An endpoint SHOULD NOT provide -more connection IDs than the peer's limit. - -An endpoint SHOULD supply a new connection ID when it receives a packet with a -previously unused connection ID or when the peer retires one, unless providing -the new connection ID would exceed the peer's limit. An endpoint MAY limit the +active_connection_id_limit transport parameter. An endpoint MUST NOT provide +more connection IDs than the peer's limit. 
An endpoint that receives more +connection IDs than its advertised active_connection_id_limit MUST close the +connection with an error of type CONNECTION_ID_LIMIT_ERROR. + +An endpoint SHOULD supply a new connection ID when the peer retires a connection +ID. If an endpoint provided fewer connection IDs than the peer's +active_connection_id_limit, it MAY supply a new connection ID when it receives +a packet with a previously unused connection ID. An endpoint MAY limit the frequency or the total number of connection IDs issued for each connection to avoid the risk of running out of connection IDs; see {{reset-token}}. @@ -1012,6 +1031,7 @@ SHOULD ensure that the pool of connection IDs available to its peer allows the peer to use a new connection ID on migration, as the peer will close the connection if the pool is exhausted. + ### Consuming and Retiring Connection IDs {#retiring-cids} An endpoint can change the connection ID it uses for a peer to another available @@ -1030,18 +1050,22 @@ packets sent from only one local address. An endpoint that migrates away from a local address SHOULD retire all connection IDs used on that address once it no longer plans to use that address. -An endpoint can request that its peer retire connection IDs by sending a +An endpoint can cause its peer to retire connection IDs by sending a NEW_CONNECTION_ID frame with an increased Retire Prior To field. Upon receipt, -the peer SHOULD retire the corresponding connection IDs and send the -corresponding RETIRE_CONNECTION_ID frames in a timely manner. Failing to do so -can cause packets to be delayed, lost, or cause the original endpoint to send a -stateless reset in response to a connection ID it can no longer route correctly. +the peer MUST first retire the corresponding connection IDs using +RETIRE_CONNECTION_ID frames and then add the newly provided connection ID to the +set of active connection IDs. 
Failure to retire the connection IDs within +approximately one PTO can cause packets to be delayed, lost, or cause the +original endpoint to send a stateless reset in response to a connection ID it +can no longer route correctly. An endpoint MAY discard a connection ID for which retirement has been requested once an interval of no less than 3 PTO has elapsed since an acknowledgement is -received for the NEW_CONNECTION_ID frame requesting that retirement. Subsequent -incoming packets using that connection ID could elicit a response with the -corresponding stateless reset token. +received for the NEW_CONNECTION_ID frame requesting that retirement. Until +then, the endpoint SHOULD be prepared to receive packets that contain the +connection ID that it has requested be retired. Subsequent incoming packets +using that connection ID could elicit a response with the corresponding +stateless reset token. ## Matching Packets to Connections {#packet-handling} @@ -1050,14 +1074,17 @@ Incoming packets are classified on receipt. Packets can either be associated with an existing connection, or - for servers - potentially create a new connection. -Hosts try to associate a packet with an existing connection. If the packet has a -non-zero-length Destination Connection ID corresponding to an existing +Endpoints try to associate a packet with an existing connection. If the packet +has a non-zero-length Destination Connection ID corresponding to an existing connection, QUIC processes that packet accordingly. Note that more than one connection ID can be associated with a connection; see {{connection-id}}. -If the Destination Connection ID is zero length and the packet matches the -local address and port of a connection where the host used zero-length -connection IDs, QUIC processes the packet as part of that connection. 
+If the Destination Connection ID is zero length and the addressing information +in the packet matches the addressing information the endpoint uses to identify a +connection with a zero-length connection ID, QUIC processes the packet as part +of that connection. An endpoint can use just destination IP and port or both +source and destination addresses for identification, though this makes +connections fragile as described in {{connection-id}}. Endpoints can send a Stateless Reset ({{stateless-reset}}) for any packets that cannot be attributed to an existing connection. A stateless reset allows a peer @@ -1130,23 +1157,35 @@ Servers MUST drop incoming packets under all other circumstances. ## Life of a QUIC Connection {#connection-lifecycle} -TBD. - - +A QUIC connection is a stateful interaction between a client and server, the +primary purpose of which is to support the exchange of data by an application +protocol. Streams ({{streams}}) are the primary means by which an application +protocol exchanges information. + +Each connection starts with a handshake phase, during which client and server +establish a shared secret using the cryptographic handshake protocol +{{QUIC-TLS}} and negotiate the application protocol. The handshake +({{handshake}}) confirms that both endpoints are willing to communicate +({{validate-handshake}}) and establishes parameters for the connection +({{transport-parameters}}). + +An application protocol can also operate in a limited fashion during the +handshake phase. 0-RTT allows application messages to be sent by a client +before receiving any messages from the server. However, 0-RTT lacks certain key +security guarantees. In particular, there is no protection against replay +attacks in 0-RTT; see {{QUIC-TLS}}. Separately, a server can also send +application data to a client before it receives the final cryptographic +handshake messages that allow it to confirm the identity and liveness of the +client. 
These capabilities allow an application protocol to offer the option to +trade some security guarantees for reduced latency. + +The use of connection IDs ({{connection-id}}) allows connections to migrate to a +new network path, both as a direct choice of an endpoint and when forced by a +change in a middlebox. {{migration}} describes mitigations for the security and +privacy issues associated with migration. + +For connections that are no longer needed or desired, there are several ways for +a client and server to terminate a connection ({{termination}}). ## Required Operations on Connections @@ -1159,7 +1198,7 @@ the operations described in this section on a QUIC connection. When implementing the client role, applications need to be able to: - open a connection, which begins the exchange described in {{handshake}}; -- enable 0-RTT; and +- enable 0-RTT when available; and - be informed when 0-RTT has been accepted or rejected by a server. When implementing the server role, applications need to be able to: @@ -1230,7 +1269,7 @@ versions of QUIC react to Version Negotiation packets when attempting to establish a connection using this version. How to perform version negotiation is left as future work defined by future versions of QUIC. In particular, that future work will need to ensure robustness against version downgrade -attacks {{version-downgrade}}. +attacks; see {{version-downgrade}}. ### Version Negotiation Between Draft Versions @@ -1239,7 +1278,7 @@ attacks {{version-downgrade}}. When a draft implementation receives a Version Negotiation packet, it MAY use it to attempt a new connection with one of the versions listed in the packet, -instead of abandoning the current connection attempt {{handle-vn}}. +instead of abandoning the current connection attempt; see {{handle-vn}}. 
The client MUST check that the Destination and Source Connection ID fields match the Source and Destination Connection ID fields in a packet that the @@ -1303,21 +1342,13 @@ properties: * authenticated negotiation of an application protocol (TLS uses ALPN {{?RFC7301}} for this purpose) -The first CRYPTO frame from a client MUST be sent in a single packet. Any -second attempt that is triggered by address validation (see -{{validate-handshake}}) MUST also be sent within a single packet. This avoids -having to reassemble a message from multiple packets. - -The first client packet of the cryptographic handshake protocol MUST fit within -a 1232 byte QUIC packet payload. This includes overheads that reduce the space -available to the cryptographic handshake protocol. - An endpoint can verify support for Explicit Congestion Notification (ECN) in the first packets it sends, as described in {{ecn-validation}}. -The CRYPTO frame can be sent in different packet number spaces. The sequence -numbers used by CRYPTO frames to ensure ordered delivery of cryptographic -handshake data start from zero in each packet number space. +The CRYPTO frame can be sent in different packet number spaces +({{packet-numbers}}). The sequence numbers used by CRYPTO frames to ensure +ordered delivery of cryptographic handshake data start from zero in each +packet number space. Endpoints MUST explicitly negotiate an application protocol. This avoids situations where there is a disagreement about the protocol that is in use. @@ -1597,19 +1628,19 @@ magnitude of any amplification attack that can be mounted using spoofed source addresses. In determining this limit, servers only count the size of successfully processed packets. -Clients MUST ensure that UDP datagrams containing only Initial packets are sized -to at least 1200 bytes, adding padding to packets in the datagram as necessary. -Sending padded datagrams ensures that the server is not overly constrained by -the amplification restriction. 
+Clients MUST ensure that UDP datagrams containing Initial packets have UDP +payloads of at least 1200 bytes, adding padding to packets in the datagram as +necessary. Sending padded datagrams ensures that the server is not overly +constrained by the amplification restriction. Packet loss, in particular loss of a Handshake packet from the server, can cause a situation in which the server cannot send when the client has no data to send and the anti-amplification limit is reached. In order to avoid this causing a -handshake deadlock, clients SHOULD send a packet upon a crypto retransmission -timeout, as described in {{QUIC-RECOVERY}}. If the client has no data to -retransmit and does not have Handshake keys, it SHOULD send an Initial packet in -a UDP datagram of at least 1200 bytes. If the client has Handshake keys, it -SHOULD send a Handshake packet. +handshake deadlock, clients MUST send a packet upon a probe timeout, as +described in {{QUIC-RECOVERY}}. If the client has no data to retransmit and does +not have Handshake keys, it MUST send an Initial packet in a UDP datagram of +at least 1200 bytes. If the client has Handshake keys, it SHOULD send a +Handshake packet. A server might wish to validate the client address before starting the cryptographic handshake. QUIC uses a token in the Initial packet to provide @@ -1623,6 +1654,14 @@ also constrained in what they can send by the limits set by the congestion controller. Clients are only constrained by the congestion controller. +### Token Construction + +A token sent in a NEW_TOKEN frame or a Retry packet MUST be constructed in a +way that allows the server to identify how it was provided to a client. These +tokens are carried in the same field, but require different handling from +servers. 
+ + ### Address Validation using Retry Packets {#validate-retry} Upon receiving the client's Initial packet, the server can request address @@ -1636,10 +1675,25 @@ As long as it is not possible for an attacker to generate a valid token for its own address (see {{token-integrity}}) and the client is able to return that token, it proves to the server that it received the token. -A server can also use a Retry packet to defer the state and processing costs -of connection establishment. By giving the client a different connection ID to -use, a server can cause the connection to be routed to a server instance with -more resources available for new connections. +A server can also use a Retry packet to defer the state and processing costs of +connection establishment. Requiring the server to provide a different +connection ID, along with the original_connection_id transport parameter defined +in {{transport-parameter-definitions}}, forces the server to demonstrate that +it, or an entity it cooperates with, received the original Initial packet from +the client. Providing a different connection ID also grants a server some +control over how subsequent packets are routed. This can be used to direct +connections to a different server instance. + +If a server receives a client Initial that can be unprotected but contains an +invalid Retry token, it knows the client will not accept another Retry token. +The server can discard such a packet and allow the client to time out to +detect handshake failure, but that could impose a significant latency penalty on +the client. A server MAY proceed with the connection without verifying the +token, though the server MUST NOT consider the client address validated. If a +server chooses not to proceed with the handshake, it SHOULD immediately close +({{immediate-close}}) the connection with an INVALID_TOKEN error. Note that a +server has not established any state for the connection at this point and so +does not enter the closing period. 
A flow showing the use of a Retry packet is shown in {{fig-retry}}. @@ -1675,44 +1729,46 @@ one. The client MUST NOT use the token provided in a Retry for future connections. Servers MAY discard any Initial packet that does not carry the expected token. -A token SHOULD be constructed in a way that allows the server to distinguish it -from tokens that are sent in Retry packets as they are carried in the same -field. - -The token MUST NOT include information that would allow it to be linked by an -on-path observer to the connection on which it was issued. For example, it -cannot include the connection ID or addressing information unless the values are -encrypted. - -Unlike the token that is created for a Retry packet, there might be some time -between when the token is created and when the token is subsequently used. -Thus, a token SHOULD have an expiration time, which could be either an explicit -expiration time or an issued timestamp that can be used to dynamically calculate -the expiration time. A server can store the expiration time or include it in an -encrypted form in the token. +Unlike the token that is created for a Retry packet, which is used immediately, +the token sent in the NEW_TOKEN frame might be used after some period of +time has passed. Thus, a token SHOULD have an expiration time, which could +be either an explicit expiration time or an issued timestamp that can be +used to dynamically calculate the expiration time. A server can store the +expiration time or include it in an encrypted form in the token. + +A token issued with NEW_TOKEN MUST NOT include information that would allow +values to be linked by an on-path observer to the connection on which it was +issued, unless the values are encrypted. For example, it cannot include the +previous connection ID or addressing information. 
A server MUST ensure that +every NEW_TOKEN frame it sends is unique across all clients, with the exception +of those sent to repair losses of previously sent NEW_TOKEN frames. Information +that allows the server to distinguish between tokens from Retry and NEW_TOKEN +MAY be accessible to entities other than the server. It is unlikely that the client port number is the same on two different connections; validating the port is therefore unlikely to be successful. -If the client has a token received in a NEW_TOKEN frame on a previous connection -to what it believes to be the same server, it SHOULD include that value in the -Token field of its Initial packet. Including a token might allow the server to -validate the client address without an additional round trip. +A token received in a NEW_TOKEN frame is applicable to any server that the +connection is considered authoritative for (e.g., server names included in the +certificate). When connecting to a server for which the client retains an +applicable and unused token, it SHOULD include that token in the Token field of +its Initial packet. Including a token might allow the server to validate the +client address without an additional round trip. A client MUST NOT include a +token that is not applicable to the server that it is connecting to, unless the +client has the knowledge that the server that issued the token and the server +the client is connecting to are jointly managing the tokens. A client MAY use a +token from any previous connection to that server. A token allows a server to correlate activity between the connection where the token was issued and any connection where it is used. Clients that want to break continuity of identity with a server MAY discard tokens provided using the -NEW_TOKEN frame. A token obtained in a Retry packet MUST be used immediately -during the connection attempt and cannot be used in subsequent connection -attempts. - -A client SHOULD NOT reuse a token in different connections. 
Reusing a token -allows connections to be linked by entities on the network path; see -{{migration-linkability}}. A client MUST NOT reuse a token if it believes that -its point of network attachment has changed since the token was last used; that -is, if there is a change in its local IP address or network interface. A client -needs to start the connection process over if there is any change in its local -address prior to completing the handshake. +NEW_TOKEN frame. In comparison, a token obtained in a Retry packet MUST be used +immediately during the connection attempt and cannot be used in subsequent +connection attempts. + +A client SHOULD NOT reuse a NEW_TOKEN token for different connection attempts. +Reusing a token allows connections to be linked by entities on the network path; +see {{migration-linkability}}. Clients might receive multiple tokens on a single connection. Aside from preventing linkability, any token can be used in any connection attempt. @@ -1748,6 +1804,12 @@ able to reuse a token. To avoid attacks that exploit this property, a server can limit its use of tokens to only the information needed to validate client addresses. +Clients MAY use tokens obtained on one connection for any connection attempt +using the same version. When selecting a token to use, clients do not need to +consider other properties of the connection that is being attempted, including +the choice of possible application protocols, session tickets, or other +connection properties. + Attackers could replay tokens to use servers as amplifiers in DDoS attacks. To protect against such attacks, servers SHOULD ensure that tokens sent in Retry packets are only accepted for a short time. Tokens that are provided in @@ -1818,7 +1880,7 @@ To initiate path validation, an endpoint sends a PATH_CHALLENGE frame containing a random payload on the path to be validated. 
An endpoint MAY send multiple PATH_CHALLENGE frames to guard against packet -loss, however an endpoint SHOULD NOT send multiple PATH_CHALLENGE frames in a +loss. However, an endpoint SHOULD NOT send multiple PATH_CHALLENGE frames in a single packet. An endpoint SHOULD NOT send a PATH_CHALLENGE more frequently than it would an Initial packet, ensuring that connection migration is no more load on a new path than establishing a new connection. @@ -1929,7 +1991,7 @@ local address. Failure of path validation simply means that the new path is not usable for this connection. Failure to validate a path does not cause the connection to end unless there are no valid alternative paths available. -An endpoint uses a new connection ID for probes sent from a new local address, +An endpoint uses a new connection ID for probes sent from a new local address; see {{migration-linkability}} for further discussion. An endpoint that uses a new local address needs to ensure that at least one new connection ID is available at the peer. That can be achieved by including a NEW_CONNECTION_ID @@ -1981,7 +2043,10 @@ to verify the peer's ownership of the unvalidated address. An endpoint MAY send data to an unvalidated peer address, but it MUST protect against potential attacks as described in {{address-spoofing}} and {{on-path-spoofing}}. An endpoint MAY skip validation of a peer address if that -address has been seen recently. +address has been seen recently. In particular, if an endpoint returns to a +previously-validated path after detecting some form of spurious migration, +skipping address validation and restoring loss detection and congestion state +can reduce the performance impact of the attack. An endpoint only changes the address that it sends packets to in response to the highest-numbered non-probing packet. This ensures that an endpoint does not send @@ -2096,7 +2161,7 @@ more likely to indicate an intentional migration rather than an attack. 
## Loss Detection and Congestion Control {#migration-cc} The capacity available on the new path might not be the same as the old path. -Packets sent on the old path SHOULD NOT contribute to congestion control or RTT +Packets sent on the old path MUST NOT contribute to congestion control or RTT estimation for the new path. On confirming a peer's ownership of its new address, an endpoint MUST @@ -2125,9 +2190,11 @@ path is no longer needed (such as the case in {{off-path-forward}}). A sender can make exceptions for probe packets so that their loss detection is independent and does not unduly cause the congestion controller to reduce its sending rate. An endpoint might set a separate timer when a PATH_CHALLENGE is -sent, which is cancelled when the corresponding PATH_RESPONSE is received. If +sent, which is cancelled if the corresponding PATH_RESPONSE is received. If the timer fires before the PATH_RESPONSE is received, the endpoint might send a new PATH_CHALLENGE, and restart the timer for a longer period of time. +This timer SHOULD be set as described in Section 5.3 of {{QUIC-RECOVERY}} and +MUST NOT be more aggressive. ## Privacy Implications of Connection Migration {#migration-linkability} @@ -2137,7 +2204,7 @@ to correlate activity between those paths. An endpoint that moves between networks might not wish to have their activity correlated by any entity other than their peer, so different connection IDs are used when sending from different local addresses, as discussed in {{connection-id}}. For this to be -effective endpoints need to ensure that connections IDs they provide cannot be +effective endpoints need to ensure that connection IDs they provide cannot be linked by any other entity. At any time, endpoints MAY change the Destination Connection ID they send to a @@ -2344,46 +2411,60 @@ source address. 
## Idle Timeout {#idle-timeout} -If the idle timeout is enabled, a connection is silently closed and the state is -discarded when it remains idle for longer than both the advertised -idle timeout (see {{transport-parameter-definitions}}) and three times the -current Probe Timeout (PTO). +If the idle timeout is enabled by either peer, a connection is silently closed +and its state is discarded when it remains idle for longer than the minimum of +the max_idle_timeouts (see {{transport-parameter-definitions}}) and three times +the current Probe Timeout (PTO). -Each endpoint advertises its own idle timeout to its peer. An endpoint -restarts any timer it maintains when a packet from its peer is received and -processed successfully. The timer is also restarted when sending a packet -containing frames other than ACK or PADDING (an ack-eliciting packet; see -{{QUIC-RECOVERY}}), but only if no other ack-eliciting packets have been sent -since last receiving a packet. Restarting when sending packets ensures that -connections do not prematurely time out when initiating new activity. +Each endpoint advertises a max_idle_timeout, but the effective value +at an endpoint is computed as the minimum of the two advertised values. By +announcing a max_idle_timeout, an endpoint commits to initiating an immediate +close ({{immediate-close}}) if it abandons the connection prior to the effective +value. + +An endpoint restarts its idle timer when a packet from its peer is received +and processed successfully. The idle timer is also restarted when sending +an ack-eliciting packet (see {{QUIC-RECOVERY}}), but only if no other +ack-eliciting packets have been sent since last receiving a packet. Restarting +when sending packets ensures that connections do not prematurely time out when +initiating new activity. An endpoint might need to send packets to avoid an +idle timeout if it is unable to send application data due to being blocked on +flow control limits; see {{flow-control}}. 
-The value for an idle timeout can be asymmetric. The value advertised by an -endpoint is only used to determine whether the connection is live at that -endpoint. An endpoint that sends packets near the end of the idle timeout -period of a peer risks having those packets discarded if its peer enters the -draining state before the packets arrive. If a peer could timeout within a -Probe Timeout (PTO; see Section 6.3 of {{QUIC-RECOVERY}}), it is advisable to -test for liveness before sending any data that cannot be retried safely. Note -that it is likely that only applications or application protocols will -know what information can be retried. +An endpoint that sends packets near the end of the idle timeout period +risks having those packets discarded if its peer enters the draining state +before the packets arrive. If a peer could time out within a Probe Timeout +(PTO; see Section 6.6 of {{QUIC-RECOVERY}}), it is advisable to test for +liveness before sending any data that cannot be retried safely. Note that it +is likely that only applications or application protocols will know what +information can be retried. -## Immediate Close +## Immediate Close {#immediate-close} An endpoint sends a CONNECTION_CLOSE frame ({{frame-connection-close}}) to terminate the connection immediately. A CONNECTION_CLOSE frame causes all streams to immediately become closed; open streams can be assumed to be implicitly reset. -After sending a CONNECTION_CLOSE frame, endpoints immediately enter the closing -state. During the closing period, an endpoint that sends a CONNECTION_CLOSE -frame SHOULD respond to any packet that it receives with another packet -containing a CONNECTION_CLOSE frame. To minimize the state that an endpoint -maintains for a closing connection, endpoints MAY send the exact same packet. -However, endpoints SHOULD limit the number of packets they generate containing a -CONNECTION_CLOSE frame. 
For instance, an endpoint could progressively increase -the number of packets that it receives before sending additional packets or -increase the time between packets. +After sending a CONNECTION_CLOSE frame, an endpoint immediately enters the +closing state. + +During the closing period, an endpoint that sends a CONNECTION_CLOSE frame +SHOULD respond to any incoming packet that can be decrypted with another packet +containing a CONNECTION_CLOSE frame. Such an endpoint SHOULD limit the number +of packets it generates containing a CONNECTION_CLOSE frame. For instance, an +endpoint could wait for a progressively increasing number of received packets or +amount of time before responding to a received packet. + +An endpoint is allowed to drop the packet protection keys when entering the +closing period ({{draining}}) and send a packet containing a CONNECTION_CLOSE in +response to any UDP datagram that is received. However, an endpoint without the +packet protection keys cannot identify and discard invalid packets. To avoid +creating an unwitting amplification attack, such endpoints MUST reduce the +frequency with which they send packets containing a CONNECTION_CLOSE frame. To +minimize the state that an endpoint maintains for a closing connection, +endpoints MAY send the exact same packet. Note: @@ -2414,25 +2495,42 @@ the application requests that the connection be closed. The application protocol can use a CONNECTION_CLOSE frame with an appropriate error code to signal closure. -When sending CONNECTION_CLOSE, the goal is to ensure that the peer will process -the frame. Generally, this means sending the frame in a packet with the highest -level of packet protection to avoid the packet being discarded. However, during -the handshake, it is possible that more advanced packet protection keys are not -available to the peer, so the frame MAY be replicated in a packet that uses a -lower packet protection level. 
-After the handshake is confirmed, an endpoint MUST send any CONNECTION_CLOSE -frames in a 1-RTT packet. Prior to handshake confirmation, the peer might not -have 1-RTT keys, so the endpoint SHOULD send CONNECTION_CLOSE frames in a -Handshake packet. If the endpoint does not have Handshake keys, it SHOULD send -CONNECTION_CLOSE frames in an Initial packet. +### Immediate Close During the Handshake {#immediate-close-hs} -A client will always know whether the server has Handshake keys -(see {{discard-initial}}), but it is possible that a server does not know -whether the client has Handshake keys. Under these circumstances, a server -SHOULD send a CONNECTION_CLOSE frame in both Handshake and Initial packets -to ensure that at least one of them is processable by the client. These -packets can be coalesced into a single UDP datagram (see {{packet-coalesce}}). +When sending CONNECTION_CLOSE, the goal is to ensure that the peer will process +the frame. Generally, this means sending the frame in a packet with the highest +level of packet protection to avoid the packet being discarded. After the +handshake is confirmed (see Section 4.1.2 of {{QUIC-TLS}}), an endpoint MUST +send any CONNECTION_CLOSE frames in a 1-RTT packet. However, prior to +confirming the handshake, it is possible that more advanced packet protection +keys are not available to the peer, so the frame MAY be replicated in a packet +that uses a lower packet protection level. + +A client will always know whether the server has Handshake keys (see +{{discard-initial}}), but it is possible that a server does not know whether the +client has Handshake keys. Under these circumstances, a server SHOULD send a +CONNECTION_CLOSE frame in both Handshake and Initial packets to ensure that at +least one of them is processable by the client. Similarly, a peer might be +unable to read 1-RTT packets, so an endpoint SHOULD send CONNECTION_CLOSE in +Handshake and 1-RTT packets prior to confirming the handshake. 
These packets +can be coalesced into a single UDP datagram; see {{packet-coalesce}}. + +An endpoint might send a CONNECTION_CLOSE frame in an Initial packet or in +response to unauthenticated information received in Initial or Handshake +packets. Such an immediate close might expose legitimate connections to a +denial of service. QUIC does not include defensive measures for on-path attacks +during the handshake; see {{handshake-dos}}. However, at the cost of reducing +feedback about errors for legitimate peers, some forms of denial of service can +be made more difficult for an attacker if endpoints discard illegal packets +rather than terminating a connection with CONNECTION_CLOSE. For this reason, +endpoints MAY discard packets rather than immediately close if errors are +detected in packets that lack authentication. + +An endpoint that has not established state, such as a server that detects an +error in an Initial packet, does not enter the closing state. An endpoint that +has no state for the connection does not enter a closing or draining period on +sending a CONNECTION_CLOSE frame. ## Stateless Reset {#stateless-reset} @@ -2558,20 +2656,38 @@ the packet other than the last 16 bytes for carrying data. ### Detecting a Stateless Reset -An endpoint detects a potential stateless reset when an incoming packet either -cannot be associated with a connection, cannot be decrypted, or is marked as a -duplicate packet. The endpoint MUST then compare the last 16 bytes of the -packet with all Stateless Reset Tokens corresponding to active connection IDs -that the endpoint has used for sending packets to the IP address and port on -which the datagram is received. This includes Stateless Reset Tokens from -NEW_CONNECTION_ID frames and the server's transport parameters. An endpoint -MUST NOT check for any Stateless Reset Tokens associated with connection IDs it -has not used or for connection IDs that have been retired. 
- -If the last 16 bytes of the packet values are identical to a Stateless Reset +An endpoint detects a potential stateless reset using the trailing 16 bytes of +the UDP datagram. An endpoint remembers all Stateless Reset Tokens associated +with the connection IDs and remote addresses for datagrams it has recently sent. +This includes Stateless Reset Tokens from NEW_CONNECTION_ID frames and the +server's transport parameters but excludes Stateless Reset Tokens associated +with connection IDs that are either unused or retired. The endpoint identifies +a received datagram as a stateless reset by comparing the last 16 bytes of the +datagram with all Stateless Reset Tokens associated with the remote address on +which the datagram was received. + +This comparison can be performed for every inbound datagram. Endpoints MAY skip +this check if any packet from a datagram is successfully processed. However, +the comparison MUST be performed when the first packet in an incoming datagram +either cannot be associated with a connection, or cannot be decrypted. + +An endpoint MUST NOT check for any Stateless Reset Tokens associated with +connection IDs it has not used or for connection IDs that have been retired. + +When comparing a datagram to Stateless Reset Token values, endpoints MUST +perform the comparison without leaking information about the value of the token. +For example, performing this comparison in constant time protects the value of +individual Stateless Reset Tokens from information leakage through timing side +channels. Another approach would be to store and compare the transformed values +of Stateless Reset Tokens instead of the raw token values, where the +transformation is defined as a cryptographically-secure pseudo-random function +using a secret key (e.g., block cipher, HMAC {{?RFC2104}}). An endpoint is not +expected to protect information about whether a packet was successfully +decrypted, or the number of valid Stateless Reset Tokens. 
+ +If the last 16 bytes of the datagram are identical in value to a Stateless Reset Token, the endpoint MUST enter the draining period and not send any further -packets on this connection. If the comparison fails, the packet can be -discarded. +packets on this connection. ### Calculating a Stateless Reset Token {#reset-token} @@ -2693,9 +2809,6 @@ frame risks a peer missing the first such packet. The only mechanism available to an endpoint that continues to receive data for a terminated connection is to use the stateless reset process ({{stateless-reset}}). -An endpoint that receives an invalid CONNECTION_CLOSE frame MUST NOT signal the -existence of the error to its peer. - ## Stream Errors @@ -2704,17 +2817,18 @@ connection in a recoverable state, the endpoint can send a RESET_STREAM frame ({{frame-reset-stream}}) with an appropriate error code to terminate just the affected stream. -RESET_STREAM MUST be instigated by the protocol using QUIC. RESET_STREAM -carries an application error code. Only the application protocol is able to +Resetting a stream without the involvement of the application protocol could +cause the application protocol to enter an unrecoverable state. RESET_STREAM +MUST only be instigated by the application protocol that uses QUIC. + +The semantics of the application error code carried in RESET_STREAM are +defined by the application protocol. Only the application protocol is able to cause a stream to be terminated. A local instance of the application protocol uses a direct API call and a remote instance uses the STOP_SENDING frame, which triggers an automatic RESET_STREAM. -Resetting a stream without knowledge of the application protocol could cause the -protocol to enter an unrecoverable state. Application protocols might require -certain streams to be reliably delivered in order to guarantee consistent state -between endpoints. Application protocols SHOULD define rules for handling -streams that are prematurely cancelled by either endpoint. 
+Application protocols SHOULD define rules for handling streams that are +prematurely cancelled by either endpoint. # Packets and Frames {#packets-frames} @@ -2736,10 +2850,11 @@ available. ## Protected Packets {#packet-protected} -All QUIC packets except Version Negotiation and Retry packets use authenticated +All QUIC packets except Version Negotiation packets use authenticated encryption with additional data (AEAD) {{!RFC5116}} to provide confidentiality -and integrity protection. Details of packet protection are found in -{{QUIC-TLS}}; this section includes an overview of the process. +and integrity protection. Retry packets use AEAD to provide integrity +protection. Details of packet protection are found in {{QUIC-TLS}}; this +section includes an overview of the process. Initial packets are protected using keys that are statically derived. This packet protection is not effective confidentiality protection. Initial @@ -2903,29 +3018,59 @@ CONNECTION_CLOSE frames is used to carry other frame-specific flags. For all other frames, the Frame Type field simply identifies the frame. These frames are explained in more detail in {{frame-formats}}. 
-| Type Value | Frame Type Name | Definition | -|:------------|:---------------------|:-------------------------------| -| 0x00 | PADDING | {{frame-padding}} | -| 0x01 | PING | {{frame-ping}} | -| 0x02 - 0x03 | ACK | {{frame-ack}} | -| 0x04 | RESET_STREAM | {{frame-reset-stream}} | -| 0x05 | STOP_SENDING | {{frame-stop-sending}} | -| 0x06 | CRYPTO | {{frame-crypto}} | -| 0x07 | NEW_TOKEN | {{frame-new-token}} | -| 0x08 - 0x0f | STREAM | {{frame-stream}} | -| 0x10 | MAX_DATA | {{frame-max-data}} | -| 0x11 | MAX_STREAM_DATA | {{frame-max-stream-data}} | -| 0x12 - 0x13 | MAX_STREAMS | {{frame-max-streams}} | -| 0x14 | DATA_BLOCKED | {{frame-data-blocked}} | -| 0x15 | STREAM_DATA_BLOCKED | {{frame-stream-data-blocked}} | -| 0x16 - 0x17 | STREAMS_BLOCKED | {{frame-streams-blocked}} | -| 0x18 | NEW_CONNECTION_ID | {{frame-new-connection-id}} | -| 0x19 | RETIRE_CONNECTION_ID | {{frame-retire-connection-id}} | -| 0x1a | PATH_CHALLENGE | {{frame-path-challenge}} | -| 0x1b | PATH_RESPONSE | {{frame-path-response}} | -| 0x1c - 0x1d | CONNECTION_CLOSE | {{frame-connection-close}} | +| Type Value | Frame Type Name | Definition | Packets | +|:------------|:---------------------|:-------------------------------|---------| +| 0x00 | PADDING | {{frame-padding}} | IH01 | +| 0x01 | PING | {{frame-ping}} | IH01 | +| 0x02 - 0x03 | ACK | {{frame-ack}} | IH_1 | +| 0x04 | RESET_STREAM | {{frame-reset-stream}} | __01 | +| 0x05 | STOP_SENDING | {{frame-stop-sending}} | __01 | +| 0x06 | CRYPTO | {{frame-crypto}} | IH_1 | +| 0x07 | NEW_TOKEN | {{frame-new-token}} | ___1 | +| 0x08 - 0x0f | STREAM | {{frame-stream}} | __01 | +| 0x10 | MAX_DATA | {{frame-max-data}} | __01 | +| 0x11 | MAX_STREAM_DATA | {{frame-max-stream-data}} | __01 | +| 0x12 - 0x13 | MAX_STREAMS | {{frame-max-streams}} | __01 | +| 0x14 | DATA_BLOCKED | {{frame-data-blocked}} | __01 | +| 0x15 | STREAM_DATA_BLOCKED | {{frame-stream-data-blocked}} | __01 | +| 0x16 - 0x17 | STREAMS_BLOCKED | {{frame-streams-blocked}} | __01 | +| 
0x18 | NEW_CONNECTION_ID | {{frame-new-connection-id}} | __01 | +| 0x19 | RETIRE_CONNECTION_ID | {{frame-retire-connection-id}} | __01 | +| 0x1a | PATH_CHALLENGE | {{frame-path-challenge}} | __01 | +| 0x1b | PATH_RESPONSE | {{frame-path-response}} | __01 | +| 0x1c - 0x1d | CONNECTION_CLOSE | {{frame-connection-close}} | IH_1* | +| 0x1e | HANDSHAKE_DONE | {{frame-handshake-done}} | ___1 | {: #frame-types title="Frame Types"} +The "Packets" column in {{frame-types}} does not form part of the IANA registry +(see {{iana-frames}}). This column lists the types of packets that each +frame type can appear in, indicated by the following characters: + +I: + +: Initial ({{packet-initial}}) + +H: + +: Handshake ({{packet-handshake}}) + +0: + +: 0-RTT ({{packet-0rtt}}) + +1: + +: 1-RTT ({{short-header}}) + +*: + +: A CONNECTION_CLOSE frame of type 0x1c can appear in Initial, Handshake, and +1-RTT packets, whereas a CONNECTION_CLOSE of type 0x1d can only appear in a +1-RTT packet. + +Section 4 of {{QUIC-TLS}} provides more detail about these restrictions. Note +that all frames can appear in 1-RTT packets. + An endpoint MUST treat the receipt of a frame of unknown type as a connection error of type FRAME_ENCODING_ERROR. @@ -2936,13 +3081,14 @@ than once. The Frame Type field uses a variable length integer encoding (see {{integer-encoding}}) with one exception. To ensure simple and efficient implementations of frame parsing, a frame type MUST use the shortest possible -encoding. Though a two-, four- or eight-byte encoding of the frame types -defined in this document is possible, the Frame Type field for these frames is -encoded on a single byte. For instance, though 0x4001 is a legitimate two-byte -encoding for a variable-length integer with a value of 1, PING frames are always -encoded as a single byte with the value 0x01. An endpoint MAY treat the receipt -of a frame type that uses a longer encoding than necessary as a connection error -of type PROTOCOL_VIOLATION. +encoding. 
For frame types defined in this document, this means a single-byte +encoding, even though it is possible to encode these values as a two-, four- +or eight-byte variable length integer. For instance, though 0x4001 is +a legitimate two-byte encoding for a variable-length integer with a value +of 1, PING frames are always encoded as a single byte with the value 0x01. +This rule applies to all current and future QUIC frame types. An endpoint +MAY treat the receipt of a frame type that uses a longer encoding than +necessary as a connection error of type PROTOCOL_VIOLATION. # Packetization and Reliability {#packetization} @@ -2992,8 +3138,8 @@ valid frames? --> ## Generating Acknowledgements {#generating-acks} Endpoints acknowledge all packets they receive and process. However, only -ack-eliciting packets (see {{QUIC-RECOVERY}}) trigger the sending of an ACK -frame. Packets that are not ack-eliciting are only acknowledged when an ACK +ack-eliciting packets cause an ACK frame to be sent within the maximum ack +delay. Packets that are not ack-eliciting are only acknowledged when an ACK frame is sent for other reasons. When sending a packet for any reason, an endpoint should attempt to bundle an @@ -3007,25 +3153,26 @@ guidance offered below seeks to strike this balance. ### Sending ACK Frames {#sending-acknowledgements} +Every packet SHOULD be acknowledged at least once, and ack-eliciting packets +MUST be acknowledged at least once within the maximum ack delay. An endpoint +communicates its maximum delay using the max_ack_delay transport parameter; +see {{transport-parameter-definitions}}. max_ack_delay declares an explicit +contract: an endpoint promises to never intentionally delay acknowledgments +of an ack-eliciting packet by more than the indicated value. If it does, +any excess accrues to the RTT estimate and could result in spurious or +delayed retransmissions from the peer. For Initial and Handshake packets, +a max_ack_delay of 0 is used. 
The sender uses the receiver's `max_ack_delay` +value in determining timeouts for timer-based retransmission, as detailed in +Section 5.2.1 of {{QUIC-RECOVERY}}. + An ACK frame SHOULD be generated for at least every second ack-eliciting packet. This recommendation is in keeping with standard practice for TCP {{?RFC5681}}. -An endpoint MUST NOT excessively delay acknowledgements of ack-eliciting -packets. An endpoint commits to a maximum delay using the max_ack_delay -transport parameter; see {{transport-parameter-definitions}}. max_ack_delay -declares an explicit contract: an endpoint promises to never delay -acknowledgments of an ack-eliciting packet by more than the indicated value. If -it does, any excess accrues to the RTT estimate and could result in delayed -retransmissions from the peer. For Initial and Handshake packets, a -max_ack_delay of 0 is used. The sender uses the receiver's `max_ack_delay` -value in determining timeouts for timer-based retransmission, as detailed -in Section 5.2.1 of {{QUIC-RECOVERY}}. - In order to assist loss detection at the sender, an endpoint SHOULD send an ACK frame immediately on receiving an ack-eliciting packet that is out of order. The endpoint MAY continue sending ACK frames immediately on each subsequently received packet, but the endpoint SHOULD return to acknowledging every other -packet after a period of 1/8 x RTT, unless more ack-eliciting packets are +packet within a period of 1/8 x RTT, unless more ack-eliciting packets are received out of order. If every subsequent ack-eliciting packet arrives out of order, then an ACK frame SHOULD be sent immediately for every received ack-eliciting packet. @@ -3041,10 +3188,10 @@ incoming packets. Packets containing PADDING frames are considered to be in flight for congestion control purposes {{QUIC-RECOVERY}}. 
Sending only PADDING frames might cause the -sender to become limited by the congestion controller (as described in -{{QUIC-RECOVERY}}) with no acknowledgments forthcoming from the -receiver. Therefore, a sender SHOULD ensure that other frames are sent in -addition to PADDING frames to elicit acknowledgments from the receiver. +sender to become limited by the congestion controller with no acknowledgments +forthcoming from the receiver. Therefore, a sender SHOULD ensure that other +frames are sent in addition to PADDING frames to elicit acknowledgments from +the receiver. An endpoint that is only sending ACK frames will not receive acknowledgments from its peer unless those acknowledgements are included in packets with @@ -3061,14 +3208,13 @@ of doing so. Packets containing only ACK frames are not congestion controlled, so there are limits on how frequently they can be sent. An endpoint MUST NOT send more than -one ACK-frame-only packet in response to receiving an ack-eliciting packet (one -containing frames other than ACK and/or PADDING). An endpoint MUST NOT send a -packet containing only an ACK frame in response to a non-ack-eliciting packet -(one containing only ACK and/or PADDING frames), even if there are packet gaps -which precede the received packet. Limiting ACK frames avoids an infinite -feedback loop of acknowledgements, which could prevent the connection from ever -becoming idle. However, the endpoint acknowledges non-ack-eliciting packets when -it sends an ACK frame. +one ACK-frame-only packet in response to receiving an ack-eliciting packet. An +endpoint MUST NOT send a non-ack-eliciting packet in response to a +non-ack-eliciting packet, even if there are packet gaps which precede the +received packet. Limiting ACK frames avoids an infinite feedback loop of +acknowledgements, which could prevent the connection from ever becoming idle. +However, the endpoint acknowledges non-ack-eliciting packets when it sends an +ACK frame.
An endpoint SHOULD treat receipt of an acknowledgment for a packet it did not send as a connection error of type PROTOCOL_VIOLATION, if it is able to detect @@ -3083,7 +3229,9 @@ caused by losing previously sent ACK frames, at the cost of larger ACK frames. ACK frames SHOULD always acknowledge the most recently received packets, and the more out-of-order the packets are, the more important it is to send an updated ACK frame quickly, to prevent the peer from declaring a packet as lost and -spuriously retransmitting the frames it contains. +spuriously retransmitting the frames it contains. An ACK frame is expected +to fit within a single QUIC packet. If it does not, then older ranges +(those with the smallest packet numbers) are omitted. {{ack-tracking}} and {{ack-limiting}} describe an exemplary approach for determining what packets to acknowledge in each ACK frame. @@ -3127,15 +3275,15 @@ received packets in preference to packets received in the past. ### Measuring and Reporting Host Delay {#host-delay} -An endpoint measures the delays intentionally introduced between when an -ack-eliciting packet is received and the corresponding acknowledgment is sent. -The endpoint encodes this delay for the largest acknowledged packet in the Ack -Delay field of an ACK frame (see {{frame-ack}}). This allows the receiver of the -ACK to adjust for any intentional delays, which is important for getting a -better estimate of the path RTT when acknowledgments are delayed. A packet might -be held in the OS kernel or elsewhere on the host before being processed. An -endpoint MUST NOT include delays that is does not control when populating the -Ack Delay field in an ACK frame. +An endpoint measures the delays intentionally introduced between the time +the packet with the largest packet number is received and the time an +acknowledgment is sent. The endpoint encodes this delay in the Ack Delay +field of an ACK frame (see {{frame-ack}}). 
This allows the receiver of the ACK +to adjust for any intentional delays, which is important for getting a better +estimate of the path RTT when acknowledgments are delayed. A packet might +be held in the OS kernel or elsewhere on the host before being processed. +An endpoint MUST NOT include delays that it does not control when populating +the Ack Delay field in an ACK frame. ### ACK Frames and Packet Protection @@ -3172,9 +3320,11 @@ containing that information is acknowledged. unless the endpoint has sent a RESET_STREAM for that stream. Once an endpoint sends a RESET_STREAM frame, no further STREAM frames are needed. -* The most recent set of acknowledgments are sent in ACK frames. An ACK frame - SHOULD contain all unacknowledged acknowledgments, as described in - {{sending-acknowledgements}}. +* ACK frames carry the most recent set of acknowledgements and the Ack Delay + from the largest acknowledged packet, as described in + {{sending-acknowledgements}}. Delaying the transmission of packets + containing ACK frames or sending old ACK frames can cause the peer to + generate an inflated RTT sample or unnecessarily disable ECN. * Cancellation of stream transmission, as carried in a RESET_STREAM frame, is sent until acknowledged or until all stream data is acknowledged by the peer @@ -3240,15 +3390,19 @@ containing that information is acknowledged. * PING and PADDING frames contain no information, so lost PING or PADDING frames do not require repair. +* The HANDSHAKE_DONE frame MUST be retransmitted until it is acknowledged. + Endpoints SHOULD prioritize retransmission of data over sending new data, unless priorities specified by the application indicate otherwise (see {{stream-prioritization}}). Even though a sender is encouraged to assemble frames containing up-to-date information every time it sends a packet, it is not forbidden to retransmit -copies of frames from lost packets. 
A receiver MUST accept packets containing -an outdated frame, such as a MAX_DATA frame carrying a smaller maximum data than -one found in an older packet. +copies of frames from lost packets. A sender that retransmits copies of frames +needs to handle decreases in available payload size due to changes in packet +number length, connection ID length, and path MTU. A receiver MUST accept +packets containing an outdated frame, such as a MAX_DATA frame carrying a +smaller maximum data than one found in an older packet. Upon detecting losses, a sender MUST take appropriate congestion control action. The details of loss detection and congestion control are described in @@ -3312,12 +3466,14 @@ errors are detected. Endpoints validate ECN for packets sent on each network path independently. An endpoint thus validates ECN on new connection establishment, when switching to a new server preferred address, and on active connection migration to a new path. +{{ecn-alg}} describes one possible algorithm for testing paths for ECN support. Even if an endpoint does not use ECN markings on packets it transmits, the endpoint MUST provide feedback about ECN markings received from the peer if they are accessible. Failing to report ECN counts will cause the peer to disable ECN marking. + #### Sending ECN Markings To start ECN validation, an endpoint SHOULD do the following when sending @@ -3340,7 +3496,7 @@ marking strategies. Implementations can also use the ECT(1) codepoint, as specified in {{?RFC8311}}. -#### Receiving ACK Frames +#### Receiving ACK Frames {#ecn-ack} An endpoint that sets ECT(0) or ECT(1) codepoints on packets it transmits MUST use the following steps on receiving an ACK frame to validate ECN. @@ -3372,6 +3528,7 @@ to be greater than the number of packets acknowledged in an ACK frame. When this happens, and if validation succeeds, the local reference counts MUST be increased to match the counts in the ACK frame.
+ #### Validation Outcomes If validation fails, then the endpoint stops sending ECN markings in subsequent @@ -3392,27 +3549,25 @@ later time in the connection. The QUIC packet size includes the QUIC header and protected payload, but not the UDP or IP header. -Clients MUST ensure they send the first Initial packet in a single IP packet. -Similarly, the first Initial packet sent after receiving a Retry packet MUST be -sent in a single IP packet. - -The payload of a UDP datagram carrying the first Initial packet MUST be expanded -to at least 1200 bytes, by adding PADDING frames to the Initial packet and/or by +A client MUST expand the payload of all UDP datagrams carrying Initial packets +to at least 1200 bytes, by adding PADDING frames to the Initial packet or by coalescing the Initial packet (see {{packet-coalesce}}). Sending a UDP datagram -of this size ensures that the network path supports a reasonable Maximum -Transmission Unit (MTU), and helps reduce the amplitude of amplification attacks -caused by server responses toward an unverified client address; see -{{address-validation}}. +of this size ensures that the network path from the client to the server +supports a reasonable Maximum Transmission Unit (MTU). Padding datagrams also +helps reduce the amplitude of amplification attacks caused by server responses +toward an unverified client address; see {{address-validation}}. -The datagram containing the first Initial packet from a client MAY exceed 1200 -bytes if the client believes that the Path Maximum Transmission Unit (PMTU) -supports the size that it chooses. +Datagrams containing Initial packets MAY exceed 1200 bytes if the client +believes that the Path Maximum Transmission Unit (PMTU) supports the size that +it chooses. -A server MAY send a CONNECTION_CLOSE frame with error code PROTOCOL_VIOLATION in -response to the first Initial packet it receives from a client if the UDP -datagram is smaller than 1200 bytes. 
It MUST NOT send any other frame type in -response, or otherwise behave as if any part of the offending packet was -processed as valid. +UDP datagrams MUST NOT be fragmented at the IP layer. In IPv4 +{{!IPv4=RFC0791}}, the DF bit MUST be set to prevent fragmentation on the path. + +A server MUST discard an Initial packet that is carried in a UDP datagram that +is smaller than 1200 bytes. A server MAY also immediately close the connection +by sending a CONNECTION_CLOSE frame with an error code of PROTOCOL_VIOLATION; +see {{immediate-close-hs}}. The server MUST also limit the number of bytes it sends before validating the address of the client; see {{address-validation}}. @@ -3499,10 +3654,10 @@ algorithm determines that the quoted packet has actually been lost. ## Datagram Packetization Layer PMTU Discovery -Section 6.4 of {{!DPLPMTUD}} provides considerations for implementing Datagram +Section 6.3 of {{!DPLPMTUD}} provides considerations for implementing Datagram Packetization Layer PMTUD (DPLPMTUD) with QUIC. -When implementing the algorithm in Section 5.3 of {{!DPLPMTUD}}, the initial +When implementing the algorithm in Section 5 of {{!DPLPMTUD}}, the initial value of BASE_PMTU SHOULD be consistent with the minimum QUIC packet size (1232 bytes for IPv6 and 1252 bytes for IPv4). @@ -3556,10 +3711,10 @@ version negotiation to be exercised. That is, any version number where the low four bits of all bytes is 1010 (in binary). A client or server MAY advertise support for any of these reserved versions. -Reserved version numbers will probably never represent a real protocol; a client -MAY use one of these version numbers with the expectation that the server will -initiate version negotiation; a server MAY advertise support for one of these -versions and can expect that clients ignore the value. 
+Reserved version numbers will never represent a real protocol; a client MAY use +one of these version numbers with the expectation that the server will initiate +version negotiation; a server MAY advertise support for one of these versions +and can expect that clients ignore the value. \[\[RFC editor: please remove the remainder of this section before publication.]] @@ -3681,8 +3836,8 @@ Example pseudo-code for packet number decoding can be found in {: #fig-long-header title="Long Header Packet Format"} Long headers are used for packets that are sent prior to the establishment -of 1-RTT keys. Once both conditions are -met, a sender switches to sending packets using the short header +of 1-RTT keys. Once 1-RTT keys are available, +a sender switches to sending packets using the short header ({{short-header}}). The long form allows for special packets - such as the Version Negotiation packet - to be represented in this uniform fixed-length packet format. Packets that use the long header contain the following fields: @@ -3944,22 +4099,21 @@ server may send multiple Initial packets. The cryptographic key exchange could require multiple round trips or retransmissions of this data. The payload of an Initial packet includes a CRYPTO frame (or frames) containing -a cryptographic handshake message, ACK frames, or both. PADDING and +a cryptographic handshake message, ACK frames, or both. PING, PADDING, and CONNECTION_CLOSE frames are also permitted. An endpoint that receives an Initial packet containing other frames can either discard the packet as spurious or treat it as a connection error. The first packet sent by a client always includes a CRYPTO frame that contains -the entirety of the first cryptographic handshake message. This packet, and the -cryptographic handshake message, MUST fit in a single UDP datagram (see -{{handshake}}). The first CRYPTO frame sent always begins at an offset of 0 -(see {{handshake}}). 
+the start or all of the first cryptographic handshake message. The first +CRYPTO frame sent always begins at an offset of 0 (see {{handshake}}). + +Note that if the server sends a HelloRetryRequest, the client will send another +series of Initial packets. These Initial packets will continue the +cryptographic handshake and will contain CRYPTO frames starting at an offset +matching the size of the CRYPTO frames sent in the first flight of Initial +packets. -Note that if the server sends a HelloRetryRequest, the client will send a second -Initial packet. This Initial packet will continue the cryptographic handshake -and will contain a CRYPTO frame with an offset matching the size of the CRYPTO -frame sent in the first Initial packet. Cryptographic handshake messages -subsequent to the first do not need to fit within a single UDP datagram. #### Abandoning Initial Packets {#discard-initial} @@ -3968,7 +4122,7 @@ first Handshake packet. A server stops sending and processing Initial packets when it receives its first Handshake packet. Though packets might still be in flight or awaiting acknowledgment, no further Initial packets need to be exchanged beyond this point. Initial packet protection keys are discarded (see -Section 4.9.1 of {{QUIC-TLS}}) along with any loss recovery and congestion +Section 4.10.1 of {{QUIC-TLS}}) along with any loss recovery and congestion control state (see Section 6.5 of {{QUIC-RECOVERY}}). Any data in CRYPTO frames is discarded - and no longer retransmitted - when @@ -3982,7 +4136,7 @@ Number Length bits. It is used to carry "early" data from the client to the server as part of the first flight, prior to handshake completion. As part of the TLS handshake, the server can accept or reject this early data. -See Section 2.3 of {{!TLS13}} for a discussion of 0-RTT data and its +See Section 2.3 of {{!TLS13=RFC8446}} for a discussion of 0-RTT data and its limitations. 
~~~ @@ -4081,9 +4235,10 @@ includes the connection ID that the sender of the packet wishes to use (see Handshake packets are their own packet number space, and thus the first Handshake packet sent by a server contains a packet number of 0. -The payload of this packet contains CRYPTO frames and could contain PADDING, or -ACK frames. Handshake packets MAY contain CONNECTION_CLOSE frames. Endpoints -MUST treat receipt of Handshake packets with other frames as a connection error. +The payload of this packet contains CRYPTO frames and could contain PING, +PADDING, or ACK frames. Handshake packets MAY contain CONNECTION_CLOSE frames. +Endpoints MUST treat receipt of Handshake packets with other frames as a +connection error. Like Initial packets (see {{discard-initial}}), data in CRYPTO frames at the Handshake encryption level is discarded - and no longer retransmitted - when @@ -4111,37 +4266,31 @@ wishes to perform a retry (see {{validate-handshake}}). +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Source Connection ID (0..160) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| ODCID Len (8) | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| Original Destination Connection ID (0..160) ... -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Retry Token (*) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| | ++ + +| | ++ Retry Integrity Tag (128) + +| | ++ + +| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ {: #retry-format title="Retry Packet"} A Retry packet (shown in {{retry-format}}) does not contain any protected fields. The value in the Unused field is selected randomly by the server. In - addition to the long header, it contains these additional fields: - -ODCID Len: - -: The ODCID Len contains the length in bytes of the Original Destination - Connection ID field that follows it. This length is encoded as a 8-bit - unsigned integer. 
In QUIC version 1, this value MUST NOT exceed 20 bytes. - Clients that receive a version 1 Retry Packet with a value larger than 20 MUST - drop the packet. - -Original Destination Connection ID: - -: The Original Destination Connection ID contains the value of the Destination - Connection ID from the Initial packet that this Retry is in response to. The - length of this field is given in ODCID Len. +addition to the long header, it contains these additional fields: Retry Token: : An opaque token that the server can use to validate the client's address. +Retry Integrity Tag: + +: See the Retry Packet Integrity section of {{QUIC-TLS}}. + @@ -4150,8 +4299,11 @@ the client included in the Source Connection ID of the Initial packet. The server includes a connection ID of its choice in the Source Connection ID field. This value MUST NOT be equal to the Destination Connection ID field of -the packet sent by the client. The client MUST use this connection ID in the -Destination Connection ID of subsequent packets that it sends. +the packet sent by the client. A client MUST discard a Retry packet that +contains a Source Connection ID field that is identical to the Destination +Connection ID field of its Initial packet. The client MUST use the value from +the Source Connection ID field of the Retry packet in the Destination Connection +ID field of subsequent packets that it sends. A server MAY send Retry packets in response to Initial and 0-RTT packets. A server can either discard or buffer 0-RTT packets that it receives. A server @@ -4163,10 +4315,11 @@ A client MUST accept and process at most one Retry packet for each connection attempt. After the client has received and processed an Initial or Retry packet from the server, it MUST discard any subsequent Retry packets that it receives. -Clients MUST discard Retry packets that contain an Original Destination -Connection ID field that does not match the Destination Connection ID from its -Initial packet.
This prevents an off-path attacker from injecting a Retry -packet. +Clients MUST discard Retry packets that have a Retry Integrity Tag that cannot +be validated, see the Retry Packet Integrity section of {{QUIC-TLS}}. This +diminishes an off-path attacker's ability to inject a Retry packet and protects +against accidental corruption of Retry packets. A client MUST discard a Retry +packet with a zero-length Retry Token field. The client responds to a Retry packet with an Initial packet that includes the provided Retry Token to continue connection establishment. @@ -4197,9 +4350,8 @@ processing a Retry packet; {{packet-0rtt}} contains more information on this. A server acknowledges the use of a Retry packet for a connection using the original_connection_id transport parameter (see {{transport-parameter-definitions}}). If the server sends a Retry packet, it -MUST include the value of the Original Destination Connection ID field of the -Retry packet (that is, the Destination Connection ID field from the client's -first Initial packet) in the transport parameter. +MUST include the Destination Connection ID field from the client's first +Initial packet in the transport parameter. If the client received and processed a Retry packet, it MUST validate that the original_connection_id transport parameter is present and correct; otherwise, it @@ -4309,12 +4461,11 @@ support the spin bit MUST implement it as specified in this section. Each endpoint unilaterally decides if the spin bit is enabled or disabled for a connection. Implementations MUST allow administrators of clients and servers to disable the spin bit either globally or on a per-connection basis. Even when -the spin bit is not disabled by the administrator, implementations MUST disable -the spin bit for a given connection with a certain likelihood. The random -selection process SHOULD be designed such that on average the spin bit is -disabled for at least one eighth of network paths. 
The selection process -performed at the beginning of the connection SHOULD be applied for all paths -used by the connection. +the spin bit is not disabled by the administrator, endpoints MUST disable their +use of the spin bit for a random selection of at least one in every 16 network +paths, or for one in every 16 connection IDs. As each endpoint disables the +spin bit independently, this ensures that the spin bit signal is disabled on +approximately one in eight network paths. When the spin bit is disabled, endpoints MAY set the spin bit to any value, and MUST ignore any incoming value. It is RECOMMENDED that endpoints set the spin @@ -4347,42 +4498,46 @@ connection. # Transport Parameter Encoding {#transport-parameter-encoding} -The format of the transport parameters is the TransportParameters struct from -{{figure-transport-parameters}}. This is described using the presentation -language from Section 3 of {{!TLS13=RFC8446}}. +The `extension_data` field of the quic_transport_parameters extension defined in +{{QUIC-TLS}} contains the QUIC transport parameters. They are encoded as a +length-prefixed sequence of transport parameters, as shown in +{{transport-parameter-sequence}}: + +~~~ + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Sequence Length (16) | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Transport Parameter 1 (*) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Transport Parameter 2 (*) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Transport Parameter N (*) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +~~~ +{: #transport-parameter-sequence title="Sequence of Transport Parameters"} + +The Sequence Length field contains the length of the sequence of transport +parameters, in bytes. 
Each transport parameter is encoded as an (identifier, +length, value) tuple, as shown in {{transport-parameter-encoding-fig}}: ~~~ - enum { - original_connection_id(0), - idle_timeout(1), - stateless_reset_token(2), - max_udp_size(3), - initial_max_data(4), - initial_max_stream_data_bidi_local(5), - initial_max_stream_data_bidi_remote(6), - initial_max_stream_data_uni(7), - initial_max_streams_bidi(8), - initial_max_streams_uni(9), - ack_delay_exponent(10), - max_ack_delay(11), - disable_active_migration(12), - preferred_address(13), - active_connection_id_limit(14), - (65535) - } TransportParameterId; - - struct { - TransportParameterId parameter; - opaque value<0..2^16-1>; - } TransportParameter; - - TransportParameter TransportParameters<0..2^16-1>; + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Transport Parameter ID (16) | Transport Param Length (16) | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Transport Parameter Value (*) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ -{: #figure-transport-parameters title="Definition of TransportParameters"} +{: #transport-parameter-encoding-fig title="Transport Parameter Encoding"} -The `extension_data` field of the quic_transport_parameters extension defined in -{{QUIC-TLS}} contains a TransportParameters value. TLS encoding rules are -therefore used to describe the encoding of transport parameters. +The Transport Param Length field contains the length of the Transport +Parameter Value field. QUIC encodes transport parameters into a sequence of bytes, which are then included in the cryptographic handshake. @@ -4410,15 +4565,16 @@ The following transport parameters are defined: original_connection_id (0x0000): : The value of the Destination Connection ID field from the first Initial packet - sent by the client.
This transport parameter is only sent by a server. A - server MUST include the original_connection_id transport parameter if it sent - a Retry packet. + sent by the client. This transport parameter is only sent by a server. A + server MUST include the original_connection_id transport parameter if it sent + a Retry packet (see {{packet-retry}}). -idle_timeout (0x0001): +max_idle_timeout (0x0001): -: The idle timeout is a value in milliseconds that is encoded as an integer; see - ({{idle-timeout}}). If this parameter is absent or zero then the idle - timeout is disabled. +: The max idle timeout is a value in milliseconds that is encoded as an integer; + see ({{idle-timeout}}). Idle timeout is disabled when both endpoints omit + this transport parameter or specify a value of 0. stateless_reset_token (0x0002): @@ -4523,29 +4679,58 @@ preferred_address (0x000d): : The server's preferred address is used to effect a change in server address at the end of the handshake, as described in {{preferred-address}}. The format - of this transport parameter is the PreferredAddress struct shown in - {{fig-preferred-address}}. This transport parameter is only sent by a server. - Servers MAY choose to only send a preferred address of one address family by - sending an all-zero address and port (0.0.0.0:0 or ::.0) for the other family. - IP addresses are encoded in network byte order. + of this transport parameter is shown in {{fig-preferred-address}}. This + transport parameter is only sent by a server. Servers MAY choose to only send + a preferred address of one address family by sending an all-zero address and + port (0.0.0.0:0 or [::]:0) for the other family. IP addresses are encoded in + network byte order. The CID Length field contains the length of the + Connection ID field.
~~~ - struct { - opaque ipv4Address[4]; - uint16 ipv4Port; - opaque ipv6Address[16]; - uint16 ipv6Port; - opaque connectionId<0..20>; - opaque statelessResetToken[16]; - } PreferredAddress; + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| IPv4 Address (32) | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| IPv4 Port (16) | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| | ++ + +| | ++ IPv6 Address (128) + +| | ++ + +| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| IPv6 Port (16) | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| CID Length (8)| ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Connection ID (*) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| | ++ + +| | ++ Stateless Reset Token (128) + +| | ++ + +| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ {: #fig-preferred-address title="Preferred Address format"} active_connection_id_limit (0x000e): -: The maximum number of connection IDs from the peer that an endpoint is willing - to store. This value includes only connection IDs sent in NEW_CONNECTION_ID - frames. If this parameter is absent, a default of 0 is assumed. +: The active connection ID limit is an integer value specifying the + maximum number of connection IDs from the peer that an endpoint is willing + to store. This value includes the connection ID received during the handshake, + that received in the preferred_address transport parameter, and those received + in NEW_CONNECTION_ID frames. + Unless a zero-length connection ID is being used, the value of the + active_connection_id_limit parameter MUST be no less than 2. If this + transport parameter is absent, a default of 2 is assumed. 
+ When a zero-length connection ID is being used, the active_connection_id_limit + parameter MUST NOT be sent. If present, transport parameters that set initial flow control limits (initial_max_stream_data_bidi_local, initial_max_stream_data_bidi_remote, and @@ -4554,9 +4739,10 @@ initial_max_stream_data_uni) are equivalent to sending a MAX_STREAM_DATA frame immediately after opening. If the transport parameter is absent, streams of that type start with a flow control limit of 0. -A client MUST NOT include an original connection ID, a stateless reset token, or -a preferred address. A server MUST treat receipt of any of these transport -parameters as a connection error of type TRANSPORT_PARAMETER_ERROR. +A client MUST NOT include server-only transport parameters +(original_connection_id, stateless_reset_token, or preferred_address). A server +MUST treat receipt of any of these transport parameters as a connection error of +type TRANSPORT_PARAMETER_ERROR. # Frame Types and Formats {#frame-formats} @@ -4594,11 +4780,12 @@ endpoints send PING frames without coordination can produce an excessive number of packets and poor performance. A connection will time out if no packets are sent or received for a period -longer than the time specified in the idle_timeout transport parameter (see -{{termination}}). However, state in middleboxes might time out earlier than -that. Though REQ-5 in {{?RFC4787}} recommends a 2 minute timeout interval, -experience shows that sending packets every 15 to 30 seconds is necessary to -prevent the majority of middleboxes from losing state for UDP flows. +longer than the time negotiated using the max_idle_timeout transport parameter +(see {{termination}}). However, state in middleboxes might time out earlier +than that. Though REQ-5 in {{?RFC4787}} recommends a 2 minute timeout +interval, experience shows that sending packets every 15 to 30 seconds is +necessary to prevent the majority of middleboxes from losing state for UDP +flows. 
## ACK Frames {#frame-ack} @@ -4615,16 +4802,17 @@ QUIC acknowledgements are irrevocable. Once acknowledged, a packet remains acknowledged, even if it does not appear in a future ACK frame. This is unlike TCP SACKs ({{?RFC2018}}). -It is expected that a sender will reuse the same packet number across different -packet number spaces. ACK frames only acknowledge the packet numbers that were -transmitted by the sender in the same packet number space of the packet that the -ACK was received in. +Packets from different packet number spaces can be identified using the same +numeric value. An acknowledgment for a packet needs to indicate both a packet +number and a packet number space. This is accomplished by having each ACK frame +only acknowledge packet numbers in the same space as the packet in which the +ACK frame is contained. Version Negotiation and Retry packets cannot be acknowledged because they do not contain a packet number. Rather than relying on ACK frames, these packets are implicitly acknowledged by the next Initial packet sent by the client. -An ACK frame is as follows: +An ACK frame is shown in {{ack-format}}. ~~~ 0 1 2 3 @@ -4697,7 +4885,7 @@ descending packet number order. The number of Gap and ACK Range values is determined by the ACK Range Count field; one of each value is present for each value in the ACK Range Count field. -ACK Ranges are structured as follows: +ACK Ranges are structured as shown in {{ack-range-format}}. ~~~ 0 1 2 3 @@ -4768,8 +4956,7 @@ subsequent ACK Range using the following formula: ~~~ If any computed packet number is negative, an endpoint MUST generate a -connection error of type FRAME_ENCODING_ERROR indicating an error in an ACK -frame. +connection error of type FRAME_ENCODING_ERROR. ### ECN Counts {#ack-ecn-counts} @@ -4780,7 +4967,7 @@ of ECT(0), ECT(1), or CE in the packet's IP header. ECN Counts are only present when the ACK frame type is 0x03. ECN Counts are only parsed when the ACK frame type is 0x03. 
There are 3 ECN -counts, as follows: +counts, as shown in {{ecn-count-format}}. ~~~ 0 1 2 3 @@ -4793,6 +4980,7 @@ counts, as follows: | ECN-CE Count (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #ecn-count-format title="ECN Count Format"} The three ECN Counts are: @@ -4823,7 +5011,7 @@ discard any data that it already received on that stream. An endpoint that receives a RESET_STREAM frame for a send-only stream MUST terminate the connection with error STREAM_STATE_ERROR. -The RESET_STREAM frame is as follows: +The RESET_STREAM frame is shown in {{fig-reset-stream}}. ~~~ 0 1 2 3 @@ -4836,6 +5024,7 @@ The RESET_STREAM frame is as follows: | Final Size (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-reset-stream title="RESET_STREAM Frame Format"} RESET_STREAM frames contain the following fields: @@ -4869,7 +5058,7 @@ connection error of type STREAM_STATE_ERROR. An endpoint that receives a STOP_SENDING frame for a receive-only stream MUST terminate the connection with error STREAM_STATE_ERROR. -The STOP_SENDING frame is as follows: +The STOP_SENDING frame is shown in {{fig-stop-sending}}. ~~~ 0 1 2 3 @@ -4880,6 +5069,7 @@ The STOP_SENDING frame is as follows: | Application Error Code (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-stop-sending title="STOP_SENDING Frame Format"} STOP_SENDING frames contain the following fields: @@ -4902,7 +5092,7 @@ are functionally identical to STREAM frames, except that they do not bear a stream identifier; they are not flow controlled; and they do not carry markers for optional offset, optional length, and the end of the stream. -The CRYPTO frame is as follows: +The CRYPTO frame is shown in {{fig-crypto}}. ~~~ 0 1 2 3 @@ -4915,7 +5105,7 @@ The CRYPTO frame is as follows: | Crypto Data (*) ... 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ -{: #crypto-format title="CRYPTO Frame Format"} +{: #fig-crypto title="CRYPTO Frame Format"} CRYPTO frames contain the following fields: @@ -4937,6 +5127,11 @@ There is a separate flow of cryptographic handshake data in each encryption level, each of which starts at an offset of 0. This implies that each encryption level is treated as a separate CRYPTO stream of data. +The largest offset delivered on a stream - the sum of the offset and data +length - cannot exceed 2^62-1. Receipt of a frame that exceeds this limit MUST +be treated as a connection error of type FRAME_ENCODING_ERROR or +CRYPTO_BUFFER_EXCEEDED. + Unlike STREAM frames, which include a Stream ID indicating to which stream the data belongs, the CRYPTO frame carries data for a single stream per encryption level. The stream does not have an explicit end, so CRYPTO frames do not have a @@ -4948,7 +5143,7 @@ FIN bit. A server sends a NEW_TOKEN frame (type=0x07) to provide the client with a token to send in the header of an Initial packet for a future connection. -The NEW_TOKEN frame is as follows: +The NEW_TOKEN frame is shown in {{fig-new-token}}. ~~~ 0 1 2 3 @@ -4959,6 +5154,7 @@ The NEW_TOKEN frame is as follows: | Token (*) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-new-token title="NEW_TOKEN Frame Format"} NEW_TOKEN frames contain the following fields: @@ -4973,8 +5169,9 @@ Token: an empty Token field as a connection error of type FRAME_ENCODING_ERROR. An endpoint might receive multiple NEW_TOKEN frames that contain the same token -value. Endpoints are responsible for discarding duplicate values, which might -be used to link connection attempts; see {{validate-future}}. +value if packets containing the frame are incorrectly determined to be lost. +Endpoints are responsible for discarding duplicate values, which might be used +to link connection attempts; see {{validate-future}}. 
Clients MUST NOT send NEW_TOKEN frames. Servers MUST treat receipt of a NEW_TOKEN frame as a connection error of type PROTOCOL_VIOLATION. @@ -5005,7 +5202,7 @@ are present in the frame. An endpoint that receives a STREAM frame for a send-only stream MUST terminate the connection with error STREAM_STATE_ERROR. -The STREAM frames are as follows: +The STREAM frames are shown in {{fig-stream}}. ~~~ 0 1 2 3 @@ -5020,7 +5217,7 @@ The STREAM frames are as follows: | Stream Data (*) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ -{: #stream-format title="STREAM Frame Format"} +{: #fig-stream title="STREAM Frame Format"} STREAM frames contain the following fields: @@ -5052,8 +5249,8 @@ the offset of the next byte that would be sent. The first byte in the stream has an offset of 0. The largest offset delivered on a stream - the sum of the offset and data length - cannot exceed 2^62-1, as it is not possible to provide flow control credit for that data. Receipt of a -frame that exceeds this limit will be treated as a connection error of type -FLOW_CONTROL_ERROR. +frame that exceeds this limit MUST be treated as a connection error of type +FRAME_ENCODING_ERROR or FLOW_CONTROL_ERROR. ## MAX_DATA Frame {#frame-max-data} @@ -5061,7 +5258,7 @@ FLOW_CONTROL_ERROR. The MAX_DATA frame (type=0x10) is used in flow control to inform the peer of the maximum amount of data that can be sent on the connection as a whole. -The MAX_DATA frame is as follows: +The MAX_DATA frame is shown in {{fig-max-data}}. ~~~ 0 1 2 3 @@ -5070,6 +5267,7 @@ The MAX_DATA frame is as follows: | Maximum Data (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-max-data title="MAX_DATA Frame Format"} MAX_DATA frames contain the following fields: @@ -5098,7 +5296,7 @@ connection error of type STREAM_STATE_ERROR. An endpoint that receives a MAX_STREAM_DATA frame for a receive-only stream MUST terminate the connection with error STREAM_STATE_ERROR. 
-The MAX_STREAM_DATA frame is as follows: +The MAX_STREAM_DATA frame is shown in {{fig-max-stream-data}}. ~~~ 0 1 2 3 @@ -5109,6 +5307,7 @@ The MAX_STREAM_DATA frame is as follows: | Maximum Stream Data (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-max-stream-data title="MAX_STREAM_DATA Frame Format"} MAX_STREAM_DATA frames contain the following fields: @@ -5142,7 +5341,7 @@ number of streams of a given type it is permitted to open. A MAX_STREAMS frame with a type of 0x12 applies to bidirectional streams, and a MAX_STREAMS frame with a type of 0x13 applies to unidirectional streams. -The MAX_STREAMS frames are as follows: +The MAX_STREAMS frames are shown in {{fig-max-streams}}. ~~~ 0 1 2 3 @@ -5151,13 +5350,17 @@ The MAX_STREAMS frames are as follows: | Maximum Streams (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-max-streams title="MAX_STREAMS Frame Format"} MAX_STREAMS frames contain the following fields: Maximum Streams: : A count of the cumulative number of streams of the corresponding type that - can be opened over the lifetime of the connection. + can be opened over the lifetime of the connection. Stream IDs cannot exceed + 2^62-1, as it is not possible to encode stream IDs larger than this value. + Receipt of a frame that permits opening of a stream larger than this limit + MUST be treated as a FRAME_ENCODING_ERROR. Loss or reordering can cause a MAX_STREAMS frame to be received which states a lower stream limit than an endpoint has previously received. MAX_STREAMS frames @@ -5181,7 +5384,7 @@ data, but is unable to due to connection-level flow control (see {{flow-control}}). DATA_BLOCKED frames can be used as input to tuning of flow control algorithms (see {{fc-credit}}). -The DATA_BLOCKED frame is as follows: +The DATA_BLOCKED frame is shown in {{fig-data-blocked}}. ~~~ 0 1 2 3 @@ -5190,6 +5393,7 @@ The DATA_BLOCKED frame is as follows: | Data Limit (i) ...
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-data-blocked title="DATA_BLOCKED Frame Format"} DATA_BLOCKED frames contain the following fields: @@ -5208,7 +5412,7 @@ analogous to DATA_BLOCKED ({{frame-data-blocked}}). An endpoint that receives a STREAM_DATA_BLOCKED frame for a send-only stream MUST terminate the connection with error STREAM_STATE_ERROR. -The STREAM_DATA_BLOCKED frame is as follows: +The STREAM_DATA_BLOCKED frame is shown in {{fig-stream-data-blocked}}. ~~~ 0 1 2 3 @@ -5219,6 +5423,7 @@ The STREAM_DATA_BLOCKED frame is as follows: | Stream Data Limit (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-stream-data-blocked title="STREAM_DATA_BLOCKED Frame Format"} STREAM_DATA_BLOCKED frames contain the following fields: @@ -5243,7 +5448,7 @@ of type 0x17 indicates reaching the unidirectional stream limit. A STREAMS_BLOCKED frame does not open the stream, but informs the peer that a new stream was needed and the stream limit prevented the creation of the stream. -The STREAMS_BLOCKED frames are as follows: +The STREAMS_BLOCKED frames are shown in {{fig-streams-blocked}}. ~~~ 0 1 2 3 @@ -5252,13 +5457,16 @@ The STREAMS_BLOCKED frames are as follows: | Stream Limit (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-streams-blocked title="STREAMS_BLOCKED Frame Format"} STREAMS_BLOCKED frames contain the following fields: Stream Limit: : A variable-length integer indicating the stream limit at the time the frame - was sent. + was sent. Stream IDs cannot exceed 2^62-1, as it is not possible to encode + stream IDs larger than this value. Receipt of a frame that encodes a larger + stream ID MUST be treated as a STREAM_LIMIT_ERROR or a FRAME_ENCODING_ERROR. 
## NEW_CONNECTION_ID Frame {#frame-new-connection-id} @@ -5267,7 +5475,7 @@ An endpoint sends a NEW_CONNECTION_ID frame (type=0x18) to provide its peer with alternative connection IDs that can be used to break linkability when migrating connections (see {{migration-linkability}}). -The NEW_CONNECTION_ID frame is as follows: +The NEW_CONNECTION_ID frame is shown in {{fig-new-connection-id}}. ~~~ 0 1 2 3 @@ -5290,6 +5498,7 @@ The NEW_CONNECTION_ID frame is as follows: | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-new-connection-id title="NEW_CONNECTION_ID Frame Format"} NEW_CONNECTION_ID frames contain the following fields: @@ -5307,7 +5516,7 @@ Length: : An 8-bit unsigned integer containing the length of the connection ID. Values less than 1 and greater than 20 are invalid and MUST be treated as a - connection error of type PROTOCOL_VIOLATION. + connection error of type FRAME_ENCODING_ERROR. Connection ID: @@ -5337,21 +5546,22 @@ sequence number, or if a sequence number is used for different connection IDs, the endpoint MAY treat that receipt as a connection error of type PROTOCOL_VIOLATION. -The Retire Prior To field is a request for the peer to retire all connection IDs -with a sequence number less than the specified value. This includes the initial -and preferred_address transport parameter connection IDs. The peer SHOULD -retire the corresponding connection IDs and send the corresponding -RETIRE_CONNECTION_ID frames in a timely manner. - -The Retire Prior To field MUST be less than or equal to the Sequence Number -field. Receiving a value greater than the Sequence Number MUST be treated as a -connection error of type PROTOCOL_VIOLATION. +The Retire Prior To field counts connection IDs established during connection +setup and the preferred_address transport parameter (see {{retiring-cids}}). The +Retire Prior To field MUST be less than or equal to the Sequence Number field. 
+Receiving a value greater than the Sequence Number MUST be treated as a +connection error of type FRAME_ENCODING_ERROR. Once a sender indicates a Retire Prior To value, smaller values sent in subsequent NEW_CONNECTION_ID frames have no effect. A receiver MUST ignore any Retire Prior To fields that do not increase the largest received Retire Prior To value. +An endpoint that receives a NEW_CONNECTION_ID frame with a sequence number +smaller than the Retire Prior To field of a previously received +NEW_CONNECTION_ID frame MUST immediately send a corresponding +RETIRE_CONNECTION_ID frame that retires the newly received connection ID. + ## RETIRE_CONNECTION_ID Frame {#frame-retire-connection-id} @@ -5365,7 +5575,7 @@ peer using the NEW_CONNECTION_ID frame ({{frame-new-connection-id}}). Retiring a connection ID invalidates the stateless reset token associated with that connection ID. -The RETIRE_CONNECTION_ID frame is as follows: +The RETIRE_CONNECTION_ID frame is shown in {{fig-retire-connection-id}}. ~~~ 0 1 2 3 @@ -5374,6 +5584,7 @@ The RETIRE_CONNECTION_ID frame is as follows: | Sequence Number (i) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-retire-connection-id title="RETIRE_CONNECTION_ID Frame Format"} RETIRE_CONNECTION_ID frames contain the following fields: @@ -5383,13 +5594,13 @@ Sequence Number: {{retiring-cids}}. Receipt of a RETIRE_CONNECTION_ID frame containing a sequence number greater -than any previously sent to the peer MAY be treated as a connection error of +than any previously sent to the peer MUST be treated as a connection error of type PROTOCOL_VIOLATION. The sequence number specified in a RETIRE_CONNECTION_ID frame MUST NOT refer to the Destination Connection ID field of the packet in which the frame is contained. The peer MAY treat this as a connection error of type -PROTOCOL_VIOLATION. +FRAME_ENCODING_ERROR. An endpoint cannot send this frame if it was provided with a zero-length connection ID by its peer. 
An endpoint that provides a zero-length connection @@ -5402,7 +5613,7 @@ type PROTOCOL_VIOLATION. Endpoints can use PATH_CHALLENGE frames (type=0x1a) to check reachability to the peer and for path validation during connection migration. -The PATH_CHALLENGE frames are as follows: +The PATH_CHALLENGE frame is shown in {{fig-path-challenge}}. ~~~ 0 1 2 3 @@ -5413,6 +5624,7 @@ The PATH_CHALLENGE frames are as follows: | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-path-challenge title="PATH_CHALLENGE Frame Format"} PATH_CHALLENGE frames contain the following fields: @@ -5450,7 +5662,7 @@ is used to signal an error with the application that uses QUIC. If there are open streams that haven't been explicitly closed, they are implicitly closed when the connection is closed. -The CONNECTION_CLOSE frames are as follows: +The CONNECTION_CLOSE frames are shown in {{fig-connection-close}}. ~~~ 0 1 2 3 @@ -5465,6 +5677,7 @@ The CONNECTION_CLOSE frames are as follows: | Reason Phrase (*) ... +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~~~ +{: #fig-connection-close title="CONNECTION_CLOSE Frame Format"} CONNECTION_CLOSE frames contain the following fields: @@ -5495,6 +5708,24 @@ Reason Phrase: zero length if the sender chooses to not give details beyond the Error Code. This SHOULD be a UTF-8 encoded string {{!RFC3629}}. +The application-specific variant of CONNECTION_CLOSE (type 0x1d) can only be +sent using a 1-RTT packet ({{QUIC-TLS}}, Section 4). When an application +wishes to abandon a connection during the handshake, an endpoint can send a +CONNECTION_CLOSE frame (type 0x1c) with an error code of 0x15a ("user_canceled" +alert; see {{?TLS13}}) in an Initial or a Handshake packet. + + +## HANDSHAKE_DONE Frame {#frame-handshake-done} + +The server uses the HANDSHAKE_DONE frame (type=0x1e) to signal confirmation of +the handshake to the client. The HANDSHAKE_DONE frame contains no additional +fields.
+ +This frame can only be sent by the server. Servers MUST NOT send a +HANDSHAKE_DONE frame before completing the handshake. A server MUST treat +receipt of a HANDSHAKE_DONE frame as a connection error of type +PROTOCOL_VIOLATION. + ## Extension Frames @@ -5508,6 +5739,12 @@ that a peer is able to understand the frame. An endpoint can use a transport parameter to signal its willingness to receive one or more extension frame types with the one transport parameter. +Extensions that modify or replace core protocol functionality (including frame +types) will be difficult to combine with other extensions which modify or +replace the same functionality unless the behavior of the combination is +explicitly defined. Such extensions SHOULD define their interaction with +previously-defined extensions modifying the same protocol components. + Extension frames MUST be congestion controlled and MUST cause an ACK frame to be sent. The exception is extension frames that replace or supplement the ACK frame. Extension frames are not included in flow control unless specified @@ -5574,11 +5811,19 @@ TRANSPORT_PARAMETER_ERROR (0x8): an invalid value, was absent even though it is mandatory, was present though it is forbidden, or is otherwise in error. +CONNECTION_ID_LIMIT_ERROR (0x9): + +: The number of connection IDs provided by the peer exceeds the advertised + active_connection_id_limit. + PROTOCOL_VIOLATION (0xA): : An endpoint detected an error with protocol compliance that was not covered by more specific error codes. +INVALID_TOKEN (0xB): +: A server received a Retry Token in a client Initial that is invalid. + CRYPTO_BUFFER_EXCEEDED (0xD): : An endpoint has received more data in CRYPTO frames than it can buffer. @@ -5612,7 +5857,7 @@ the CONNECTION_CLOSE frame with a type of 0x1d ({{frame-connection-close}}). 
# Security Considerations -## Handshake Denial of Service +## Handshake Denial of Service {#handshake-dos} As an encrypted and authenticated transport QUIC provides a range of protections against denial of service. Once the cryptographic handshake is complete, QUIC @@ -5629,15 +5874,19 @@ During the creation of a connection, QUIC only provides protection against attack from off the network path. All QUIC packets contain proof that the recipient saw a preceding packet from its peer. -The first mechanism used is the source and destination connection IDs, which are -required to match those set by a peer. Except for an Initial and stateless -reset packets, an endpoint only accepts packets that include a destination -connection that matches a connection ID the endpoint previously chose. This is -the only protection offered for Version Negotiation packets. +Addresses cannot change during the handshake, so endpoints can discard packets +that are received on a different network path. + +The Source and Destination Connection ID fields are the primary means of +protection against off-path attack during the handshake. These are required to +match those set by a peer. Except for an Initial and stateless reset packets, +an endpoint only accepts packets that include a Destination Connection ID field +that matches a value the endpoint previously chose. This is the only protection +offered for Version Negotiation packets. -The destination connection ID in an Initial packet is selected by a client to be -unpredictable, which serves an additional purpose. The packets that carry the -cryptographic handshake are protected with a key that is derived from this +The Destination Connection ID field in an Initial packet is selected by a client +to be unpredictable, which serves an additional purpose. The packets that carry +the cryptographic handshake are protected with a key that is derived from this connection ID and salt specific to the QUIC version. 
This allows endpoints to use the same process for authenticating packets that they receive as they use after the cryptographic handshake completes. Packets that cannot be @@ -5804,13 +6053,14 @@ be influenced by an attacker. ## Version Downgrade {#version-downgrade} -This document defines QUIC Version Negotiation packets {{version-negotiation}}, -which can be used to negotiate the QUIC version used between two endpoints. -However, this document does not specify how this negotiation will be performed -between this version and subsequent future versions. In particular, Version -Negotiation packets do not contain any mechanism to prevent version downgrade -attacks. Future versions of QUIC that use Version Negotiation packets MUST -define a mechanism that is robust against version downgrade attacks. +This document defines QUIC Version Negotiation packets in +{{version-negotiation}}, which can be used to negotiate the QUIC version used +between two endpoints. However, this document does not specify how this +negotiation will be performed between this version and subsequent future +versions. In particular, Version Negotiation packets do not contain any +mechanism to prevent version downgrade attacks. Future versions of QUIC that +use Version Negotiation packets MUST define a mechanism that is robust against +version downgrade attacks. ## Targeted Attacks by Routing @@ -5823,45 +6073,459 @@ decisions are made independently of client-selected values; a Source Connection ID can be selected to route later packets to the same server. -# IANA Considerations +## Overview of Security Properties {#security-properties} -## QUIC Transport Parameter Registry {#iana-transport-parameters} +A complete security analysis of QUIC is outside the scope of this document. +This section provides an informal description of the desired security properties +as an aid to implementors and to help guide protocol analysis. 
-IANA \[SHALL add/has added] a registry for "QUIC Transport Parameters" under a -"QUIC Protocol" heading. +QUIC assumes the threat model described in {{?SEC-CONS=RFC3552}} and provides +protections against many of the attacks that arise from that model. -The "QUIC Transport Parameters" registry governs a 16-bit space. This space is -split into two spaces that are governed by different policies. Values with the -first byte in the range 0x00 to 0xfe (in hexadecimal) are assigned via the -Specification Required policy {{!RFC8126}}. Values with the first byte 0xff are -reserved for Private Use {{!RFC8126}}. +For this purpose, attacks are divided into passive and active attacks. Passive +attackers have the capability to read packets from the network, while active +attackers also have the capability to write packets into the network. However, +a passive attack may involve an attacker with the ability to cause a routing +change or other modification in the path taken by packets that comprise a +connection. -Registrations MUST include the following fields: +Attackers are additionally categorized as either on-path attackers or off-path +attackers; see Section 3.5 of {{?SEC-CONS}}. An on-path attacker can read, +modify, or remove any packet it observes such that it no longer reaches its +destination, while an off-path attacker observes the packets, but cannot prevent +the original packet from reaching its intended destination. An off-path +attacker can also transmit arbitrary packets. -Value: +Properties of the handshake, protected packets, and connection migration are +considered separately. -: The numeric value of the assignment (registrations will be between 0x0000 and - 0xfeff). -Parameter Name: +### Handshake {#handshake-properties} -: A short mnemonic for the parameter. +The QUIC handshake incorporates the TLS 1.3 handshake and enjoys the +cryptographic properties described in Appendix E.1 of {{?TLS13=RFC8446}}. 
-Specification: +In addition to those properties, the handshake is intended to provide some +defense against DoS attacks on the handshake, as described below. + + +#### Anti-Amplification + +Address validation ({{address-validation}}) is used to verify that an entity +that claims a given address is able to receive packets at that address. Address +validation limits amplification attack targets to addresses for which an +attacker is either on-path or off-path. + +Prior to validation, endpoints are limited in what they are able to send. +During the handshake, a server cannot send more than three times the data it +receives; clients that initiate new connections or migrate to a new network +path are limited. + + +#### Server-Side DoS + +Computing the server's first flight for a full handshake is potentially +expensive, requiring both a signature and a key exchange computation. In order +to prevent computational DoS attacks, the Retry packet provides a cheap token +exchange mechanism which allows servers to validate a client's IP address prior +to doing any expensive computations at the cost of a single round trip. After a +successful handshake, servers can issue new tokens to a client which will allow +new connection establishment without incurring this cost. + + +#### On-Path Handshake Termination + +An on-path or off-path attacker can force a handshake to fail by replacing or +racing Initial packets. Once valid Initial packets have been exchanged, +subsequent Handshake packets are protected with the handshake keys and an +on-path attacker cannot force handshake failure other than by dropping packets +to cause endpoints to abandon the attempt. + +An on-path attacker can also replace the addresses of packets on either side and +therefore cause the client or server to have an incorrect view of the remote +addresses. Such an attack is indistinguishable from the functions performed by a +NAT. 
+ + +#### Parameter Negotiation +The entire handshake is cryptographically protected, with the Initial packets +being encrypted with per-version keys and the Handshake and later packets being +encrypted with keys derived from the TLS key exchange. Further, parameter +negotiation is folded into the TLS transcript and thus provides the same +security guarantees as ordinary TLS negotiation. Thus, an attacker can observe +the client's transport parameters (as long as it knows the version-specific +salt) but cannot observe the server's transport parameters and cannot influence +parameter negotiation. + +Connection IDs are unencrypted but integrity protected in all packets. + +This version of QUIC does not incorporate a version negotiation mechanism; +implementations of incompatible versions will simply fail to establish a +connection. + + +### Protected Packets {#protected-packet-properties} + +Packet protection ({{packet-protected}}) provides authentication and encryption +of all packets except Version Negotiation packets, though Initial and Retry +packets have limited encryption and authentication based on version-specific +keys; see {{QUIC-TLS}} for more details. This section considers passive and +active attacks against protected packets. + +Both on-path and off-path attackers can mount a passive attack in which they +save observed packets for an offline attack against packet protection at a +future time; this is true for any observer of any packet on any network. + +A blind attacker, one who injects packets without being able to observe valid +packets for a connection, is unlikely to be successful, since packet protection +ensures that valid packets are only generated by endpoints which possess the +key material established during the handshake; see {{handshake}} and +{{handshake-properties}}. 
Similarly, any active attacker that observes packets +and attempts to insert new data or modify existing data in those packets should +not be able to generate packets deemed valid by the receiving endpoint. + +A spoofing attack, in which an active attacker rewrites unprotected parts of a +packet that it forwards or injects, such as the source or destination +address, is only effective if the attacker can forward packets to the original +endpoint. Packet protection ensures that the packet payloads can only be +processed by the endpoints that completed the handshake, and invalid +packets are ignored by those endpoints. + +An attacker can also modify the boundaries between packets and UDP datagrams, +causing multiple packets to be coalesced into a single datagram, or splitting +coalesced packets into multiple datagrams. Aside from datagrams containing +Initial packets, which require padding, modification of how packets are +arranged in datagrams has no functional effect on a connection, although it +might change some performance characteristics. + + +### Connection Migration {#migration-properties} + +Connection Migration ({{migration}}) provides endpoints with the ability to +transition between IP addresses and ports on multiple paths, using one path at a +time for transmission and receipt of non-probing frames. Path validation +({{migrate-validate}}) establishes that a peer is both willing and able +to receive packets sent on a particular path. This helps reduce the effects of +address spoofing by limiting the number of packets sent to a spoofed address. + +This section describes the intended security properties of connection migration +when under various types of DoS attacks. + + +#### On-Path Active Attacks + +An attacker that can cause a packet it observes to no longer reach its intended +destination is considered an on-path attacker. 
When an attacker is present +between a client and server, endpoints are required to send packets through the +attacker to establish connectivity on a given path. + +An on-path attacker can: + +- Inspect packets +- Modify IP and UDP packet headers +- Inject new packets +- Delay packets +- Reorder packets +- Drop packets +- Split and merge datagrams along packet boundaries + +An on-path attacker cannot: + +- Modify an authenticated portion of a packet and cause the recipient to accept + that packet + +An on-path attacker has the opportunity to modify the packets that it observes, +however any modifications to an authenticated portion of a packet will cause it +to be dropped by the receiving endpoint as invalid, as packet payloads are both +authenticated and encrypted. + +In the presence of an on-path attacker, QUIC aims to provide the following +properties: + +1. An on-path attacker can prevent use of a path for a connection, causing + it to fail if it cannot use a different path that does not contain the + attacker. This can be achieved by dropping all packets, modifying them so + that they fail to decrypt, or other methods. + +2. An on-path attacker can prevent migration to a new path for which the + attacker is also on-path by causing path validation to fail on the new path. + +3. An on-path attacker cannot prevent a client from migrating to a path for + which the attacker is not on-path. + +4. An on-path attacker can reduce the throughput of a connection by delaying + packets or dropping them. + +5. An on-path attacker cannot cause an endpoint to accept a packet for which it + has modified an authenticated portion of that packet. + + +#### Off-Path Active Attacks + +An off-path attacker is not directly on the path between a client and server, +but could be able to obtain copies of some or all packets sent between the +client and the server. It is also able to send copies of those packets to +either endpoint. 
+ +An off-path attacker can: + +- Inspect packets +- Inject new packets +- Reorder injected packets + +An off-path attacker cannot: + +- Modify any part of a packet +- Delay packets +- Drop packets +- Reorder original packets + +An off-path attacker can modify packets that it has observed and inject them +back into the network, potentially with spoofed source and destination +addresses. + +For the purposes of this discussion, it is assumed that an off-path attacker +has the ability to observe, modify, and re-inject a packet into the network +that will reach the destination endpoint prior to the arrival of the original +packet observed by the attacker. In other words, an attacker has the ability to +consistently "win" a race with the legitimate packets between the endpoints, +potentially causing the original packet to be ignored by the recipient. + +It is also assumed that an attacker has the resources necessary to affect NAT +state, potentially both causing an endpoint to lose its NAT binding, and an +attacker to obtain the same port for use with its traffic. + +In the presence of an off-path attacker, QUIC aims to provide the following +properties: + +1. An off-path attacker can race packets and attempt to become a "limited" + on-path attacker. + +2. An off-path attacker can cause path validation to succeed for forwarded + packets with the source address listed as the off-path attacker as long as + it can provide improved connectivity between the client and the server. + +3. An off-path attacker cannot cause a connection to close once the handshake + has completed. + +4. An off-path attacker cannot cause migration to a new path to fail if it + cannot observe the new path. + +5. An off-path attacker can become a limited on-path attacker during migration + to a new path for which it is also an off-path attacker. + +6. 
An off-path attacker can become a limited on-path attacker by affecting + shared NAT state such that it sends packets to the server from the same IP + address and port that the client originally used. + + +#### Limited On-Path Active Attacks + +A limited on-path attacker is an off-path attacker that has offered improved +routing of packets by duplicating and forwarding original packets between the +server and the client, causing those packets to arrive before the original +copies such that the original packets are dropped by the destination endpoint. + +A limited on-path attacker differs from an on-path attacker in that it is not on +the original path between endpoints, and therefore the original packets sent by +an endpoint are still reaching their destination. This means that a future +failure to route copied packets to the destination faster than their original +path will not prevent the original packets from reaching the destination. + +A limited on-path attacker can: + +- Inspect packets +- Inject new packets +- Modify unencrypted packet headers +- Reorder packets + +A limited on-path attacker cannot: + +- Delay packets so that they arrive later than packets sent on the original path +- Drop packets +- Modify the authenticated and encrypted portion of a packet and cause the + recipient to accept that packet + +A limited on-path attacker can only delay packets up to the point that the +original packets arrive before the duplicate packets, meaning that it cannot +offer routing with worse latency than the original path. If a limited on-path +attacker drops packets, the original copy will still arrive at the destination +endpoint. + +In the presence of a limited on-path attacker, QUIC aims to provide the +following properties: + +1. A limited on-path attacker cannot cause a connection to close once the + handshake has completed. + +2. A limited on-path attacker cannot cause an idle connection to close if the + client is first to resume activity. + +3. 
A limited on-path attacker can cause an idle connection to be deemed lost if + the server is the first to resume activity. + +Note that these guarantees are the same guarantees provided for any NAT, for the +same reasons. + + +# IANA Considerations {#iana} + +This document establishes several registries for the management of codepoints in +QUIC. These registries operate on a common set of policies as defined in +{{iana-policy}}. + + +## Registration Policies for QUIC Registries {#iana-policy} + +All QUIC registries allow for both provisional and permanent registration of +codepoints. This section documents policies that are common to these +registries. + + +### Provisional Registrations {#iana-provisional} + +Provisional registrations of codepoints are intended to allow for private use and +experimentation with extensions to QUIC. Provisional registrations only require +the inclusion of the codepoint value and contact information. However, +provisional registrations could be reclaimed and reassigned for another purpose. + +Provisional registrations require Expert Review, as defined in Section 4.5 of +{{!RFC8126}}. Designated expert(s) are advised that only registrations for an +excessive proportion of remaining codepoint space or the very first unassigned +value (see {{iana-random}}) can be rejected. + +Provisional registrations will include a date field that indicates when the +registration was last updated. A request to update the date on any provisional +registration can be made without review from the designated expert(s). + +All QUIC registries include the following fields to support provisional +registration: + +Value: +: The assigned codepoint. + +Status: +: "Permanent" or "Provisional". + +Specification: : A reference to a publicly available specification for the value. -The nominated expert(s) verify that a specification exists and is readily -accessible.
Expert(s) are encouraged to be biased towards approving -registrations unless they are abusive, frivolous, or actively harmful (not -merely aesthetically displeasing, or architecturally dubious). +Date: +: The date of last update to the registration. + +Contact: +: Contact details for the registrant. + +Notes: +: Supplementary notes about the registration. + +Provisional registrations MAY omit the Specification and Notes fields, plus any +additional fields that might be required for a permanent registration. The Date +field is not required as part of requesting a registration as it is set to the +date the registration is created or updated. + + +### Selecting Codepoints {#iana-random} + +New uses of codepoints from QUIC registries SHOULD use a randomly selected +codepoint that excludes both existing allocations and the first unallocated +codepoint in the selected space. Requests for multiple codepoints MAY use a +contiguous range. This minimizes the risk that differing semantics are +attributed to the same codepoint by different implementations. Use of the first +codepoint in a range is intended for use by specifications that are developed +through the standards process {{?STD=RFC2026}} and its allocation MUST be +negotiated with IANA before use. + +For codepoints that are encoded in variable-length integers +({{integer-encoding}}), such as frame types, codepoints that encode to four or +eight bytes (that is, values 2^14 and above) SHOULD be used unless the usage is +especially sensitive to having a longer encoding. + +Applications to register codepoints in QUIC registries MAY include a codepoint +as part of the registration. IANA MUST allocate the selected codepoint unless +that codepoint is already assigned or the codepoint is the first unallocated +codepoint in the registry. 
+ + +### Reclaiming Provisional Codepoints + +A request might be made to remove an unused provisional registration from the +registry to reclaim space in a registry, or portion of the registry (such as the +64-16383 range for codepoints that use variable-length encodings). This SHOULD +be done only for the codepoints with the earliest recorded date and entries that +have been updated less than a year prior SHOULD NOT be reclaimed. + +A request to remove a codepoint MUST be reviewed by the designated expert(s). +The expert(s) MUST attempt to determine whether the codepoint is still in use. +Experts are advised to contact the listed contacts for the registration, plus as +wide a set of protocol implementers as possible in order to determine whether +any use of the codepoint is known. The expert(s) are advised to allow at least +four weeks for responses. + +If any use of the codepoints is identified by this search or a request to update +the registration is made, the codepoint MUST NOT be reclaimed. Instead, the +date on the registration is updated. A note might be added for the registration +recording relevant information that was learned. + +If no use of the codepoint was identified and no request was made to update the +registration, the codepoint MAY be removed from the registry. + +This process also applies to requests to change a provisional registration into +a permanent registration, except that the goal is not to determine whether there +is no use of the codepoint, but to determine that the registration is an +accurate representation of any deployed usage. + + +### Permanent Registrations {#iana-permanent} + +Permanent registrations in QUIC registries use the Specification Required policy +{{!RFC8126}}, unless otherwise specified. The designated expert(s) verify that +a specification exists and is readily accessible. 
Expert(s) are encouraged to +be biased towards approving registrations unless they are abusive, frivolous, or +actively harmful (not merely aesthetically displeasing, or architecturally +dubious). The creation of a registry MAY specify additional constraints on +permanent registrations. + +The creation of a registry MAY identify a range of codepoints where +registrations are governed by a different registration policy. For instance, +the registries for 62-bit codepoints in this document have stricter policies for +codepoints in the range from 0 to 63. + +Any stricter requirements for permanent registrations do not prevent provisional +registrations for affected codepoints. For instance, a provisional registration +for a frame type {{iana-frames}} of 61 could be requested. + +All registrations made by Standards Track publications MUST be permanent. + +All registrations in this document are assigned a permanent status and list as +contact both the IESG (ietf@ietf.org) and the QUIC working group +(quic@ietf.org). + + +## QUIC Transport Parameter Registry {#iana-transport-parameters} + +IANA \[SHALL add/has added] a registry for "QUIC Transport Parameters" under a +"QUIC" heading. + +The "QUIC Transport Parameters" registry governs a 16-bit space. This registry +follows the registration policy from {{iana-policy}}. Permanent registrations +in this registry are assigned using the Specification Required policy +{{!RFC8126}}. + +In addition to the fields in {{iana-provisional}}, permanent registrations in +this registry MUST include the following fields: + +Parameter Name: + +: A short mnemonic for the parameter. The initial contents of this registry are shown in {{iana-tp-table}}.
| Value | Parameter Name | Specification | |:-------|:----------------------------|:------------------------------------| | 0x0000 | original_connection_id | {{transport-parameter-definitions}} | -| 0x0001 | idle_timeout | {{transport-parameter-definitions}} | +| 0x0001 | max_idle_timeout | {{transport-parameter-definitions}} | | 0x0002 | stateless_reset_token | {{transport-parameter-definitions}} | | 0x0003 | max_udp_size | {{transport-parameter-definitions}} | | 0x0004 | initial_max_data | {{transport-parameter-definitions}} | @@ -5878,47 +6542,34 @@ The initial contents of this registry are shown in {{iana-tp-table}}. {: #iana-tp-table title="Initial QUIC Transport Parameters Entries"} Additionally, each value of the format `31 * N + 27` for integer values of N -(that is, `27`, `58`, `89`, ...) MUST NOT be assigned by IANA. +(that is, `27`, `58`, `89`, ...) are reserved and MUST NOT be assigned by IANA. ## QUIC Frame Type Registry {#iana-frames} IANA \[SHALL add/has added] a registry for "QUIC Frame Types" under a -"QUIC Protocol" heading. +"QUIC" heading. -The "QUIC Frame Types" registry governs a 62-bit space. This space is split -into three spaces that are governed by different policies. Values between 0x00 -and 0x3f (in hexadecimal) are assigned via the Standards Action or IESG Review -policies {{!RFC8126}}. Values from 0x40 to 0x3fff operate on the Specification -Required policy {{!RFC8126}}. All other values are assigned to Private Use -{{!RFC8126}}. - -Registrations MUST include the following fields: +The "QUIC Frame Types" registry governs a 62-bit space. This registry follows +the registration policy from {{iana-policy}}. Permanent registrations in this +registry are assigned using the Specification Required policy {{!RFC8126}}, +except for values between 0x00 and 0x3f (in hexadecimal; inclusive), which are +assigned using Standards Action or IESG Approval as defined in Section 4.9 and +4.10 of {{!RFC8126}}. 
-Value: - -: The numeric value of the assignment (registrations will be between 0x00 and - 0x3fff). A range of values MAY be assigned. +In addition to the fields in {{iana-provisional}}, permanent registrations in +this registry MUST include the following fields: Frame Name: : A short mnemonic for the frame type. -Specification: - -: A reference to a publicly available specification for the value. - -The nominated expert(s) verify that a specification exists and is readily -accessible. Specifications for new registrations need to describe the means by -which an endpoint might determine that it can send the identified type of frame. -An accompanying transport parameter registration (see -{{iana-transport-parameters}}) is expected for most registrations. The -specification needs to describe the format and assigned semantics of any fields -in the frame. - -Expert(s) are encouraged to be biased towards approving registrations unless -they are abusive, frivolous, or actively harmful (not merely aesthetically -displeasing, or architecturally dubious). +In addition to the advice in {{iana-policy}}, specifications for new permanent +registrations SHOULD describe the means by which an endpoint might determine +that it can send the identified type of frame. An accompanying transport +parameter registration (see {{iana-transport-parameters}}) is expected for most +registrations. Specifications for permanent registrations also need to +describe the format and assigned semantics of any fields in the frame. The initial contents of this registry are tabulated in {{frame-types}}. @@ -5926,21 +6577,17 @@ The initial contents of this registry are tabulated in {{frame-types}}. ## QUIC Transport Error Codes Registry {#iana-error-codes} IANA \[SHALL add/has added] a registry for "QUIC Transport Error Codes" under a -"QUIC Protocol" heading. +"QUIC" heading. The "QUIC Transport Error Codes" registry governs a 62-bit space.
This space is -split into three spaces that are governed by different policies. Values between -0x00 and 0x3f (in hexadecimal) are assigned via the Standards Action or IESG -Review policies {{!RFC8126}}. Values from 0x40 to 0x3fff operate on the -Specification Required policy {{!RFC8126}}. All other values are assigned to -Private Use {{!RFC8126}}. +split into three spaces that are governed by different policies. Permanent +registrations in this registry are assigned using the Specification Required +policy {{!RFC8126}}, except for values between 0x00 and 0x3f (in hexadecimal; +inclusive), which are assigned using Standards Action or IESG Approval as +defined in Sections 4.9 and 4.10 of {{!RFC8126}}. -Registrations MUST include the following fields: - -Value: - -: The numeric value of the assignment (registrations will be between 0x0000 and - 0x3fff). +In addition to the fields in {{iana-provisional}}, permanent registrations in +this registry MUST include the following fields: Code: @@ -5951,15 +6598,6 @@ Description: : A brief description of the error code semantics, which MAY be a summary if a specification reference is provided. -Specification: - -: A reference to a publicly available specification for the value. - -The nominated expert(s) verify that a specification exists and is readily -accessible. Expert(s) are encouraged to be biased towards approving -registrations unless they are abusive, frivolous, or actively harmful (not -merely aesthetically displeasing, or architecturally dubious). - The initial contents of this registry are shown in {{iana-error-table}}. | Value | Error | Description | Specification | @@ -5973,7 +6611,9 @@ The initial contents of this registry are shown in {{iana-error-table}}.
| 0x6 | FINAL_SIZE_ERROR | Change to final size | {{error-codes}} | | 0x7 | FRAME_ENCODING_ERROR | Frame encoding error | {{error-codes}} | | 0x8 | TRANSPORT_PARAMETER_ERROR | Error in transport parameters | {{error-codes}} | +| 0x9 | CONNECTION_ID_LIMIT_ERROR | Too many connection IDs received | {{error-codes}} | | 0xA | PROTOCOL_VIOLATION | Generic protocol violation | {{error-codes}} | +| 0xB | INVALID_TOKEN | Invalid Token Received | {{error-codes}} | | 0xD | CRYPTO_BUFFER_EXCEEDED | CRYPTO data buffer overflowed | {{error-codes}} | {: #iana-error-table title="Initial QUIC Transport Error Codes Entries"} @@ -5982,8 +6622,8 @@ The initial contents of this registry are shown in {{iana-error-table}}. # Sample Packet Number Decoding Algorithm {#sample-packet-number-decoding} -The following pseudo-code shows how an implementation can decode packet -numbers after header protection has been removed. +The pseudo-code in {{alg-decode-pn}} shows how an implementation can decode +packet numbers after header protection has been removed. ~~~ DecodePacketNumber(largest_pn, truncated_pn, pn_nbits): @@ -6001,16 +6641,60 @@ DecodePacketNumber(largest_pn, truncated_pn, pn_nbits): // // The following code calculates a candidate value and // makes sure it's within the packet number window. + // Note the extra checks to prevent overflow and underflow. candidate_pn = (expected_pn & ~pn_mask) | truncated_pn - if candidate_pn <= expected_pn - pn_hwin: + if candidate_pn <= expected_pn - pn_hwin and + candidate_pn < (1 << 62) - pn_win: return candidate_pn + pn_win - // Note the extra check for underflow when candidate_pn - // is near zero. 
if candidate_pn > expected_pn + pn_hwin and - candidate_pn > pn_win: + candidate_pn >= pn_win: return candidate_pn - pn_win return candidate_pn ~~~ +{: #alg-decode-pn title="Sample Packet Number Decoding Algorithm"} + + +# Sample ECN Validation Algorithm {#ecn-alg} + +Each time an endpoint commences sending on a new network path, it determines +whether the path supports ECN; see {{ecn}}. If the path supports ECN, the goal +is to use ECN. Endpoints might also periodically reassess a path that was +determined to not support ECN. + +This section describes one method for testing new paths. This algorithm is +intended to show how a path might be tested for ECN support. Endpoints can +implement different methods. + +The path is assigned an ECN state that is one of "testing", "unknown", "failed", +or "capable". On paths with a "testing" or "capable" state the endpoint sends +packets with an ECT marking, by default ECT(0); otherwise, the endpoint sends +unmarked packets. + +To start testing a path, the ECN state is set to "testing" and existing ECN +counts are remembered as a baseline. + +The testing period runs for a number of packets or round-trip times, as +determined by the endpoint. The goal is not to limit the duration of the +testing period, but to ensure that enough marked packets are sent for received +ECN counts to provide a clear indication of how the path treats marked packets. +{{ecn-ack}} suggests limiting this to 10 packets or 3 round-trip times. + +After the testing period ends, the ECN state for the path becomes "unknown". +From the "unknown" state, successful validation of the ECN counts in an ACK frame +(see {{ecn-ack}}) causes the ECN state for the path to become "capable", unless +no marked packet has been acknowledged. + +If validation of ECN counts fails at any time, the ECN state for the affected +path becomes "failed". An endpoint can also mark the ECN state for a path as +"failed" if marked packets are all declared lost or if they are all CE marked.
+ +Following this algorithm ensures that ECN is rarely disabled for paths that +properly support ECN. Any path that incorrectly modifies markings will cause +ECN to be disabled. For those rare cases where marked packets are discarded by +the path, the short duration of the testing period limits the number of losses +incurred. + + # Change Log @@ -6019,6 +6703,52 @@ DecodePacketNumber(largest_pn, truncated_pn, pn_nbits): Issue and pull request numbers are listed with a leading octothorp. +## Since draft-ietf-quic-transport-24 + +- Added HANDSHAKE_DONE to signal handshake confirmation (#2863, #3142, #3145) +- Add integrity check to Retry packets (#3014, #3274, #3120) +- Specify handling of reordered NEW_CONNECTION_ID frames (#3194, #3202) +- Require checking of sequence numbers in RETIRE_CONNECTION_ID (#3037, #3036) +- active_connection_id_limit is enforced (#3193, #3197, #3200, #3201) +- Correct overflow in packet number decode algorithm (#3187, #3188) +- Allow use of CRYPTO_BUFFER_EXCEEDED for CRYPTO frame errors (#3258, #3186) +- Define applicability and scope of NEW_TOKEN (#3150, #3152, #3155, #3156) +- Tokens from Retry and NEW_TOKEN must be differentiated (#3127, #3128) +- Allow CONNECTION_CLOSE in response to invalid token (#3168, #3107) +- Treat an invalid CONNECTION_CLOSE as an invalid frame (#2475, #3230, #3231) +- Throttle when sending CONNECTION_CLOSE after discarding state (#3095, #3157) +- Application-variant of CONNECTION_CLOSE can only be sent in 0-RTT or 1-RTT + packets (#3158, #3164) +- Advise sending while blocked to avoid idle timeout (#2744, #3266) +- Define error codes for invalid frames (#3027, #3042) +- Idle timeout is symmetric (#2602, #3099) +- Prohibit IP fragmentation (#3243, #3280) +- Define the use of provisional registration for all registries (#3109, #3020, + #3102, #3170) +- Packets on one path must not adjust values for a different path (#2909, + #3139) + +## Since draft-ietf-quic-transport-23 + +- Allow ClientHello to span multiple 
packets (#2928, #3045) +- Client Initial size constraints apply to UDP datagram payload (#3053, #3051) +- Stateless reset changes (#2152, #2993) + - tokens need to be compared in constant time + - detection uses UDP datagrams, not packets + - tokens cannot be reused (#2785, #2968) +- Clearer rules for sharing of UDP ports and use of connection IDs when doing so + (#2844, #2851) +- A new connection ID is necessary when responding to migration (#2778, #2969) +- Stronger requirements for connection ID retirement (#3046, #3096) +- NEW_TOKEN cannot be empty (#2978, #2977) +- PING can be sent at any encryption level (#3034, #3035) +- CONNECTION_CLOSE is not ack-eliciting (#3097, #3098) +- Frame encoding error conditions updated (#3027, #3042) +- Non-ack-eliciting packets cannot be sent in response to non-ack-eliciting + packets (#3100, #3104) +- Servers have to change connection IDs in Retry (#2837, #3147) + + ## Since draft-ietf-quic-transport-22 - Rules for preventing correlation by connection ID tightened (#2084, #2929) @@ -6455,28 +7185,43 @@ Substantial editorial reorganization; no technical changes. - Moved Contributors and Acknowledgments to appendices -# Acknowledgments -{:numbered="false"} - -Special thanks are due to the following for helping shape pre-IETF QUIC and its -deployment: Chris Bentzel, Misha Efimov, Roberto Peon, Alistair Riddoch, -Siddharth Vijayakrishnan, and Assar Westerlund. - -This document has benefited immensely from various private discussions and -public ones on the quic@ietf.org and proto-quic@chromium.org mailing lists. Our -thanks to all. - - # Contributors {:numbered="false"} -The original authors of this specification were Ryan Hamilton, Jana Iyengar, Ian -Swett, and Alyssa Wilk. - The original design and rationale behind this protocol draw significantly from -work by Jim Roskind {{EARLY-DESIGN}}. 
In alphabetical order, the contributors to -the pre-IETF QUIC project at Google are: Britt Cyr, Jeremy Dorfman, Ryan -Hamilton, Jana Iyengar, Fedor Kouranov, Charles Krasic, Jo Kulik, Adam Langley, -Jim Roskind, Robbie Shade, Satyam Shekhar, Cherie Shi, Ian Swett, Raman Tenneti, -Victor Vasiliev, Antonio Vicente, Patrik Westin, Alyssa Wilk, Dale Worley, Fan -Yang, Dan Zhang, Daniel Ziegler. +work by Jim Roskind {{EARLY-DESIGN}}. + +The IETF QUIC Working Group received an enormous amount of support from many +people. The following people provided substantive contributions to this +document: +Alessandro Ghedini, +Alyssa Wilk, +Antoine Delignat-Lavaud, +Brian Trammell, +Christian Huitema, +Colin Perkins, +David Schinazi, +Dmitri Tikhonov, +Eric Kinnear, +Eric Rescorla, +Gorry Fairhurst, +Ian Swett, +Igor Lubashev, +Kazuho Oku, +Lucas Pardue, +Magnus Westerlund, +Marten Seemann, +Martin Duke, +Mike Bishop, +Mikkel Fahnøe Jørgensen, +Mirja Kühlewind, +Nick Banks, +Nick Harper, +Patrick McManus, +Roberto Peon, +Ryan Hamilton, +Subodh Iyengar, +Tatsuhiro Tsujikawa, +Ted Hardie, +Tom Jones, +and Victor Vasiliev.
diff --git a/ietf.json b/ietf.json index 57b7dd0d4f..e34889ab06 100644 --- a/ietf.json +++ b/ietf.json @@ -1,11 +1,12 @@ { "group": "quic", - "primary": true, - "group_name": "QUIC", - "group_type": "wg", - "group_email": "quic@ietf.org", + "group_info": { + "name": "QUIC", + "type": "wg", + "email": "quic@ietf.org", + "activity_exclude_labels": ["editorial"] + }, "repo_type": "specs", "revisions_tagged": true, - "report_to": "group_email", - "report_exclude_labels": ["editorial"] + "activity_summary_to": ["group_email"] } \ No newline at end of file diff --git a/initial-protection.js b/protection-samples.js similarity index 90% rename from initial-protection.js rename to protection-samples.js index 785772d3d5..183b74f160 100755 --- a/initial-protection.js +++ b/protection-samples.js @@ -15,6 +15,8 @@ var SHA256 = 'sha256'; var AES_GCM = 'aes-128-gcm'; var AES_ECB = 'aes-128-ecb'; +var version = 'ff000019'; + function log(m, k) { console.log(m + ' [' + k.length + ']: ' + k.toString('hex')); }; @@ -252,11 +254,32 @@ function test(role, cid, hdr, pn, body) { } } -var version = 'ff000017' +function hex_cid(cid) { + return '0' + (cid.length / 2).toString(16) + cid; +} + +function retry(dcid, scid, odcid) { + var pfx = Buffer.from(hex_cid(odcid), 'hex'); + var encoded = Buffer.from('ff' + version + hex_cid(dcid) + hex_cid(scid), 'hex'); + var token = Buffer.from('token', 'ascii'); + var header = Buffer.concat([encoded, token]); + log('retry header', header); + var aad = Buffer.concat([pfx, header]); + log('retry aad', aad); + + var key = Buffer.from('4d32ecdb2a2133c841e4043df27d4430', 'hex'); + var nonce = Buffer.from('4d1611d05513a552c587d575', 'hex'); + + var gcm = crypto.createCipheriv(AES_GCM, key, nonce); + gcm.setAAD(aad); + gcm.update(''); + gcm.final(); + log('retry', Buffer.concat([header, gcm.getAuthTag()])); +} + var cid = '8394c8f03e515708'; -var dcidl = '0' + (cid.length / 2).toString(16); -var ci_hdr = 'c3' + version + dcidl + cid + '0000'; +var ci_hdr = 
'c3' + version + hex_cid(cid) + '0000'; // This is a client Initial. Unfortunately, the ClientHello currently omits // the transport_parameters extension. var crypto_frame = '060040c4' + @@ -277,6 +300,7 @@ var frames = '0d0000000018410a' + '690b84d08a60993c144eca684d1081287c834d5311' + 'bcf32bb9da1a002b00020304'; var scid = 'f067a5502a4262b5'; -var scidl = '0' + (scid.length / 2).toString(16); -var si_hdr = 'c1' + version + '00' + scidl + scid + '00'; +var si_hdr = 'c1' + version + '00' + hex_cid(scid) + '00'; test('server', cid, si_hdr, 1, frames); + +retry('', scid, cid);