Skip to content

Commit f120792

Browse files
committed
Replace MIME parsing with custom HTTP parsing.
Given that websockets makes straightforward use of HTTP, that WebSocket implementations can be expected not to exhibit legacy behaviors, and that RFC 7230 deprecates those behaviors, parsing HTTP is doable. Thanks to https://github.com/njsmith/h11 for providing some inspiration, especially for translating the RFC to regular expressions and figuring out some edge cases. I expect the new implementation to be faster, since it has a much tighter focus than the stdlib's general-purpose MIME parser, and possibly more secure, since it was written from the beginning with security as a primary goal (with the caveat that it's new code, which means it's more likely to have security issues). Fix #19.
1 parent b1d09a1 commit f120792

File tree

6 files changed

+155
-53
lines changed

6 files changed

+155
-53
lines changed

docs/changelog.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ Changelog
1414

1515
* Made read and write buffer sizes configurable.
1616

17+
* Rewrote HTTP handling for simplicity and performance.
18+
1719
3.3
1820
...
1921

websockets/client.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import asyncio
77
import collections.abc
8-
import email.message
8+
import http.client
99

1010
from .exceptions import InvalidHandshake, InvalidMessage
1111
from .handshake import build_request, check_response
@@ -35,9 +35,8 @@ def write_http_request(self, path, headers):
3535
3636
"""
3737
self.path = path
38-
self.request_headers = email.message.Message()
39-
for name, value in headers:
40-
self.request_headers[name] = value
38+
self.request_headers = http.client.HTTPMessage()
39+
self.request_headers._headers = headers # HACK
4140
self.raw_request_headers = headers
4241

4342
# Since the path and headers only contain ASCII characters,
@@ -63,10 +62,11 @@ def read_http_response(self):
6362
except ValueError as exc:
6463
raise InvalidMessage("Malformed HTTP message") from exc
6564

66-
self.response_headers = headers
67-
self.raw_response_headers = list(headers.raw_items())
65+
self.response_headers = http.client.HTTPMessage()
66+
self.response_headers._headers = headers # HACK
67+
self.raw_response_headers = headers
6868

69-
return status_code, headers
69+
return status_code, self.response_headers
7070

7171
def process_subprotocol(self, get_header, subprotocols=None):
7272
"""

websockets/http.py

Lines changed: 105 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@
88
"""
99

1010
import asyncio
11-
import email.parser
12-
import io
11+
import re
1312
import sys
1413

1514
from .version import version as websockets_version
@@ -26,6 +25,26 @@
2625
))
2726

2827

28+
# See https://tools.ietf.org/html/rfc7230#appendix-B.
29+
30+
# Regex for validating header names.
31+
32+
_token_re = re.compile(rb'^[-!#$%&\'*+.^_`|~0-9a-zA-Z]+$')
33+
34+
# Regex for validating header values.
35+
36+
# We don't attempt to support obsolete line folding.
37+
38+
# Include HTAB (\x09), SP (\x20), VCHAR (\x21-\x7e), obs-text (\x80-\xff).
39+
40+
# The ABNF is complicated because it attempts to express that optional
41+
# whitespace is ignored. We strip whitespace and don't revalidate that.
42+
43+
# See also https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189
44+
45+
_value_re = re.compile(rb'^[\x09\x20-\x7e\x80-\xff]*$')
46+
47+
2948
@asyncio.coroutine
3049
def read_request(stream):
3150
"""
@@ -34,20 +53,38 @@ def read_request(stream):
3453
``stream`` is an :class:`~asyncio.StreamReader`.
3554
3655
Return ``(path, headers)`` where ``path`` is a :class:`str` and
37-
``headers`` is a :class:`~email.message.Message`. ``path`` isn't
38-
URL-decoded.
56+
``headers`` is a list of ``(name, value)`` tuples.
57+
58+
``path`` isn't URL-decoded or validated in any way.
59+
60+
Non-ASCII characters are represented with surrogate escapes.
3961
4062
Raise an exception if the request isn't well formatted.
4163
4264
The request is assumed not to contain a body.
4365
4466
"""
45-
request_line, headers = yield from read_message(stream)
46-
method, path, version = request_line[:-2].decode().split(None, 2)
47-
if method != 'GET':
48-
raise ValueError("Unsupported method")
49-
if version != 'HTTP/1.1':
50-
raise ValueError("Unsupported HTTP version")
67+
# https://tools.ietf.org/html/rfc7230#section-3.1.1
68+
69+
# Parsing is simple because fixed values are expected for method and
70+
# version and because path isn't checked. Since WebSocket software tends
71+
# to implement HTTP/1.1 strictly, there's little need for lenient parsing.
72+
73+
# Given the implementation of read_line(), request_line ends with CRLF.
74+
request_line = yield from read_line(stream)
75+
76+
# This may raise "ValueError: not enough values to unpack"
77+
method, path, version = request_line[:-2].split(b' ', 2)
78+
79+
if method != b'GET':
80+
raise ValueError("Unsupported HTTP method: %r" % method)
81+
if version != b'HTTP/1.1':
82+
raise ValueError("Unsupported HTTP version: %r" % version)
83+
84+
path = path.decode('ascii', 'surrogateescape')
85+
86+
headers = yield from read_headers(stream)
87+
5188
return path, headers
5289

5390

@@ -59,45 +96,82 @@ def read_response(stream):
5996
``stream`` is an :class:`~asyncio.StreamReader`.
6097
6198
Return ``(status, headers)`` where ``status`` is a :class:`int` and
62-
``headers`` is a :class:`~email.message.Message`.
99+
``headers`` is a list of ``(name, value)`` tuples.
100+
101+
Non-ASCII characters are represented with surrogate escapes.
63102
64103
Raise an exception if the response isn't well formatted.
65104
66105
The response is assumed not to contain a body.
67106
68107
"""
69-
status_line, headers = yield from read_message(stream)
70-
version, status, reason = status_line[:-2].decode().split(" ", 2)
71-
if version != 'HTTP/1.1':
72-
raise ValueError("Unsupported HTTP version")
73-
return int(status), headers
108+
# https://tools.ietf.org/html/rfc7230#section-3.1.2
109+
110+
# As in read_request, parsing is simple because a fixed value is expected
111+
# for version, status is a 3-digit number, and reason can be ignored.
112+
113+
# Given the implementation of read_line(), status_line ends with CRLF.
114+
status_line = yield from read_line(stream)
115+
116+
# This may raise "ValueError: not enough values to unpack"
117+
version, status, reason = status_line[:-2].split(b' ', 2)
118+
119+
if version != b'HTTP/1.1':
120+
raise ValueError("Unsupported HTTP version: %r" % version)
121+
# This may raise "ValueError: invalid literal for int() with base 10"
122+
status = int(status)
123+
if not 100 <= status < 1000:
124+
raise ValueError("Unsupported HTTP status code: %d" % status)
125+
if not _value_re.match(reason):
126+
raise ValueError("Invalid HTTP reason phrase: %r" % reason)
127+
128+
headers = yield from read_headers(stream)
129+
130+
return status, headers
74131

75132

76133
@asyncio.coroutine
77-
def read_message(stream):
134+
def read_headers(stream):
78135
"""
79136
Read HTTP headers from ``stream``.
80137
81138
``stream`` is an :class:`~asyncio.StreamReader`.
82139
83140
Return ``headers``; the start line is read separately by the caller,
84-
and ``headers`` is a :class:`~email.message.Message`.
141+
and ``headers`` is a list of ``(name, value)`` tuples.
142+
143+
Non-ASCII characters are represented with surrogate escapes.
85144
86145
The message is assumed not to contain a body.
87146
88147
"""
89-
start_line = yield from read_line(stream)
90-
header_lines = io.BytesIO()
91-
for num in range(MAX_HEADERS):
92-
header_line = yield from read_line(stream)
93-
header_lines.write(header_line)
94-
if header_line == b'\r\n':
148+
# https://tools.ietf.org/html/rfc7230#section-3.2
149+
150+
# We don't attempt to support obsolete line folding.
151+
152+
headers = []
153+
for _ in range(MAX_HEADERS):
154+
line = yield from read_line(stream)
155+
if line == b'\r\n':
95156
break
157+
158+
# This may raise "ValueError: not enough values to unpack"
159+
name, value = line[:-2].split(b':', 1)
160+
if not _token_re.match(name):
161+
raise ValueError("Invalid HTTP header name: %r" % name)
162+
value = value.strip(b' \t')
163+
if not _value_re.match(value):
164+
raise ValueError("Invalid HTTP header value: %r" % value)
165+
166+
headers.append((
167+
name.decode('ascii'), # guaranteed to be ASCII at this point
168+
value.decode('ascii', 'surrogateescape'),
169+
))
170+
96171
else:
97-
raise ValueError("Too many headers")
98-
header_lines.seek(0)
99-
headers = email.parser.BytesHeaderParser().parse(header_lines)
100-
return start_line, headers
172+
raise ValueError("Too many HTTP headers")
173+
174+
return headers
101175

102176

103177
@asyncio.coroutine
@@ -108,9 +182,12 @@ def read_line(stream):
108182
``stream`` is an :class:`~asyncio.StreamReader`.
109183
110184
"""
185+
# Security: this is bounded by the StreamReader's limit (default = 32kB).
111186
line = yield from stream.readline()
187+
# Security: this guarantees header values are small (hardcoded = 4kB)
112188
if len(line) > MAX_LINE:
113189
raise ValueError("Line too long")
190+
# Not mandatory but safe - https://tools.ietf.org/html/rfc7230#section-3.5
114191
if not line.endswith(b'\r\n'):
115192
raise ValueError("Line without CRLF")
116193
return line

websockets/protocol.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,13 @@ class WebSocketCommonProtocol(asyncio.StreamReaderProtocol):
9292
processed, the request path is available in the :attr:`path` attribute,
9393
and the request and response HTTP headers are available:
9494
95-
* as a MIME :class:`~email.message.Message` in the :attr:`request_headers`
95+
* as a :class:`~http.client.HTTPMessage` in the :attr:`request_headers`
9696
and :attr:`response_headers` attributes
9797
* as an iterable of (name, value) pairs in the :attr:`raw_request_headers`
9898
and :attr:`raw_response_headers` attributes
9999
100+
These attributes must be treated as immutable.
101+
100102
If a subprotocol was negotiated, it's available in the :attr:`subprotocol`
101103
attribute.
102104

websockets/server.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66

77
import asyncio
88
import collections.abc
9-
import email.message
10-
import http
9+
import http.client
1110
import logging
1211

1312
from .compatibility import asyncio_ensure_future
@@ -155,20 +154,20 @@ def read_http_request(self):
155154
raise InvalidMessage("Malformed HTTP message") from exc
156155

157156
self.path = path
158-
self.request_headers = headers
159-
self.raw_request_headers = list(headers.raw_items())
157+
self.request_headers = http.client.HTTPMessage()
158+
self.request_headers._headers = headers # HACK
159+
self.raw_request_headers = headers
160160

161-
return path, headers
161+
return path, self.request_headers
162162

163163
@asyncio.coroutine
164164
def write_http_response(self, status, headers):
165165
"""
166166
Write status line and headers to the HTTP response.
167167
168168
"""
169-
self.response_headers = email.message.Message()
170-
for name, value in headers:
171-
self.response_headers[name] = value
169+
self.response_headers = http.client.HTTPMessage()
170+
self.response_headers._headers = headers # HACK
172171
self.raw_response_headers = headers
173172

174173
# Since the status line and headers only contain ASCII characters,

websockets/test_http.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import unittest
33

44
from .http import *
5-
from .http import read_message # private API
5+
from .http import read_headers # private API
66

77

88
class HTTPTests(unittest.TestCase):
@@ -32,7 +32,7 @@ def test_read_request(self):
3232
)
3333
path, hdrs = self.loop.run_until_complete(read_request(self.stream))
3434
self.assertEqual(path, '/chat')
35-
self.assertEqual(hdrs['Upgrade'], 'websocket')
35+
self.assertEqual(dict(hdrs)['Upgrade'], 'websocket')
3636

3737
def test_read_response(self):
3838
# Example from the protocol overview in RFC 6455
@@ -46,32 +46,54 @@ def test_read_response(self):
4646
)
4747
status, hdrs = self.loop.run_until_complete(read_response(self.stream))
4848
self.assertEqual(status, 101)
49-
self.assertEqual(hdrs['Upgrade'], 'websocket')
49+
self.assertEqual(dict(hdrs)['Upgrade'], 'websocket')
5050

51-
def test_method(self):
51+
def test_request_method(self):
5252
self.stream.feed_data(b'OPTIONS * HTTP/1.1\r\n\r\n')
5353
with self.assertRaises(ValueError):
5454
self.loop.run_until_complete(read_request(self.stream))
5555

56-
def test_version(self):
56+
def test_request_version(self):
5757
self.stream.feed_data(b'GET /chat HTTP/1.0\r\n\r\n')
5858
with self.assertRaises(ValueError):
5959
self.loop.run_until_complete(read_request(self.stream))
60+
61+
def test_response_version(self):
6062
self.stream.feed_data(b'HTTP/1.0 400 Bad Request\r\n\r\n')
6163
with self.assertRaises(ValueError):
6264
self.loop.run_until_complete(read_response(self.stream))
6365

66+
def test_response_status(self):
67+
self.stream.feed_data(b'HTTP/1.1 007 My name is Bond\r\n\r\n')
68+
with self.assertRaises(ValueError):
69+
self.loop.run_until_complete(read_response(self.stream))
70+
71+
def test_response_reason(self):
72+
self.stream.feed_data(b'HTTP/1.1 200 \x7f\r\n\r\n')
73+
with self.assertRaises(ValueError):
74+
self.loop.run_until_complete(read_response(self.stream))
75+
76+
def test_header_name(self):
77+
self.stream.feed_data(b'foo bar: baz qux\r\n\r\n')
78+
with self.assertRaises(ValueError):
79+
self.loop.run_until_complete(read_headers(self.stream))
80+
81+
def test_header_value(self):
82+
self.stream.feed_data(b'foo: \x00\x00\x0f\r\n\r\n')
83+
with self.assertRaises(ValueError):
84+
self.loop.run_until_complete(read_headers(self.stream))
85+
6486
def test_headers_limit(self):
6587
self.stream.feed_data(b'foo: bar\r\n' * 500 + b'\r\n')
6688
with self.assertRaises(ValueError):
67-
self.loop.run_until_complete(read_message(self.stream))
89+
self.loop.run_until_complete(read_headers(self.stream))
6890

6991
def test_line_limit(self):
7092
self.stream.feed_data(b'a' * 5000 + b'\r\n\r\n')
7193
with self.assertRaises(ValueError):
72-
self.loop.run_until_complete(read_message(self.stream))
94+
self.loop.run_until_complete(read_headers(self.stream))
7395

7496
def test_line_ending(self):
75-
self.stream.feed_data(b'GET / HTTP/1.1\n\n')
97+
self.stream.feed_data(b'foo: bar\n\n')
7698
with self.assertRaises(ValueError):
77-
self.loop.run_until_complete(read_message(self.stream))
99+
self.loop.run_until_complete(read_headers(self.stream))

0 commit comments

Comments
 (0)