From 9e141aa70207bc127c9b49426013a04458c0ec88 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Wed, 5 Jul 2017 10:52:37 -0700 Subject: [PATCH 01/11] real quick sketch of what changes are necessary to remove family URL's various signatures. All tests passing, no problem. --- hyperlink/_url.py | 32 ++++++++++++++++---------------- hyperlink/test/test_url.py | 20 ++++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index e1d115a0..bf8a9dca 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -531,15 +531,14 @@ def parse_host(host): >>> parse_host('googlewebsite.com') == (None, 'googlewebsite.com') True - >>> parse_host('[::1]') == (socket.AF_INET6, '::1') + >>> parse_host('::1') == (socket.AF_INET6, '::1') True >>> parse_host('192.168.1.1') == (socket.AF_INET, '192.168.1.1') True """ if not host: return None, u'' - if u':' in host and u'[' == host[0] and u']' == host[-1]: - host = host[1:-1] + if u':' in host: try: inet_pton(socket.AF_INET6, host) except socket.error as se: @@ -617,7 +616,7 @@ class URL(object): """ def __init__(self, scheme=None, host=None, path=(), query=(), fragment=u'', - port=None, rooted=None, userinfo=u'', family=None, uses_netloc=None): + port=None, rooted=None, userinfo=u'', uses_netloc=None): if host is not None and scheme is None: scheme = u'http' # TODO: why if port is None: @@ -645,7 +644,7 @@ def __init__(self, scheme=None, host=None, path=(), query=(), fragment=u'', ' "-", and "." allowed. Did you meant to call' ' %s.from_text()?' % (self._scheme, self.__class__.__name__)) - self._host = _textcheck("host", host, '/?#@') + family, self._host = parse_host(_textcheck('host', host, '/?#@')) if isinstance(path, unicode): raise TypeError("expected iterable of text for path, not: %r" % (path,)) @@ -660,10 +659,9 @@ def __init__(self, scheme=None, host=None, path=(), query=(), fragment=u'', self._port = _typecheck("port", port, int, NoneType) self._rooted = _typecheck("rooted", rooted, bool) self._userinfo = _textcheck("userinfo", userinfo, '/?#@') - self._family = _typecheck("family", family, - type(socket.AF_INET), NoneType) - if ':' in self._host and self._family != socket.AF_INET6: - raise ValueError('invalid ":" present in host: %r' % self._host) + + # if ':' in self._host and self._family != socket.AF_INET6: + # raise ValueError('invalid ":" present in host: %r' % self._host) uses_netloc = scheme_uses_netloc(self._scheme, uses_netloc) self._uses_netloc = _typecheck("uses_netloc", @@ -811,8 +809,9 @@ def authority(self, with_password=False, **kw): with_password = kw.pop('includeSecrets', with_password) if kw: raise TypeError('got unexpected keyword arguments: %r' % kw.keys()) - if self.family == socket.AF_INET6: - hostport = ['[' + self.host + ']'] + host = self.host + if ':' in host: + hostport = ['[' + host + ']'] else: hostport = [self.host] if self.port != SCHEME_PORT_MAP.get(self.scheme): @@ -830,7 +829,7 @@ def __eq__(self, other): if not isinstance(other, self.__class__): return NotImplemented for attr in ['scheme', 'userinfo', 'host', 'query', - 'fragment', 'port', 'family', 'uses_netloc']: + 'fragment', 'port', 'uses_netloc']: if getattr(self, attr) != getattr(other, attr): return False if self.path == other.path or (self.path in _ROOT_PATHS @@ -846,7 +845,7 @@ def __ne__(self, other): def __hash__(self): return hash((self.__class__, self.scheme, self.userinfo, self.host, self.path, self.query, self.fragment, self.port, - self.rooted, self.family, self.uses_netloc)) + self.rooted, self.uses_netloc)) @property def absolute(self): @@ -907,7 +906,6 @@ def replace(self, scheme=_UNSET, host=_UNSET, path=_UNSET, query=_UNSET, port=_optional(port, self.port), rooted=_optional(rooted, self.rooted), userinfo=_optional(userinfo, self.userinfo), - family=_optional(family, self.family), uses_netloc=_optional(uses_netloc, self.uses_netloc) ) @@ -970,7 +968,9 @@ def from_text(cls, text): raise URLParseError('port must not be empty') raise URLParseError('expected integer for port, not %r' % port_str) - family, host = parse_host(host) + # _, host = parse_host(host) + if host: + host = host.lstrip('[').rstrip(']') scheme = gs['scheme'] or u'' fragment = gs['fragment'] or u'' @@ -992,7 +992,7 @@ def from_text(cls, text): else: query = () return cls(scheme, host, path, query, fragment, port, - rooted, userinfo, family, uses_netloc) + rooted, userinfo, uses_netloc) def child(self, *segments): """Make a new :class:`URL` where the given path segments are a child diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index a5af642a..9b43bbb2 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -856,7 +856,7 @@ def test_ipv6_with_port(self): url = URL.from_text(t) assert url.host == '2001:0db8:85a3:0000:0000:8a2e:0370:7334' assert url.port == 80 - assert url.family == socket.AF_INET6 + # assert url.family == socket.AF_INET6 assert SCHEME_PORT_MAP[url.scheme] != url.port def test_invalid_ipv6(self): @@ -871,15 +871,15 @@ def test_invalid_ipv6(self): socket.AF_INET6, ip) self.assertRaises(URLParseError, URL.from_text, url_text) - def test_ip_family_detection(self): - u = URL.from_text('http://giggle.com') - self.assertEqual(u.family, None) - - u = URL.from_text('http://127.0.0.1/a/b/?c=d') - self.assertEqual(u.family, socket.AF_INET) - - u = URL.from_text('http://[::1]/a/b/?c=d') - self.assertEqual(u.family, socket.AF_INET6) + # def test_ip_family_detection(self): + # u = URL.from_text('http://giggle.com') + # self.assertEqual(u.family, None) + # + # u = URL.from_text('http://127.0.0.1/a/b/?c=d') + # self.assertEqual(u.family, socket.AF_INET) + # + # u = URL.from_text('http://[::1]/a/b/?c=d') + # self.assertEqual(u.family, socket.AF_INET6) def test_invalid_port(self): self.assertRaises(URLParseError, URL.from_text, 'ftp://portmouth:smash') From 6d8ee41cf2dcd016d3d3246ccde8bd67c3a14b49 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Thu, 6 Jul 2017 23:09:39 -0700 Subject: [PATCH 02/11] more family cleanup --- hyperlink/_url.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index bf8a9dca..c42201b6 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -601,9 +601,6 @@ class URL(object): rooted (bool): Whether or not the path begins with a slash. userinfo (unicode): The username or colon-separated username:password pair. - family: A socket module constant used when the host is an - IP constant to differentiate IPv4 and domain names, as - well as validate IPv6. uses_netloc (bool): Indicates whether two slashes appear between the scheme and the host (``http://eg.com`` vs ``mailto:e@g.com``). Set automatically based on scheme. @@ -644,7 +641,7 @@ def __init__(self, scheme=None, host=None, path=(), query=(), fragment=u'', ' "-", and "." allowed. Did you meant to call' ' %s.from_text()?' % (self._scheme, self.__class__.__name__)) - family, self._host = parse_host(_textcheck('host', host, '/?#@')) + _, self._host = parse_host(_textcheck('host', host, '/?#@')) if isinstance(path, unicode): raise TypeError("expected iterable of text for path, not: %r" % (path,)) @@ -660,9 +657,6 @@ def __init__(self, scheme=None, host=None, path=(), query=(), fragment=u'', self._rooted = _typecheck("rooted", rooted, bool) self._userinfo = _textcheck("userinfo", userinfo, '/?#@') - # if ':' in self._host and self._family != socket.AF_INET6: - # raise ValueError('invalid ":" present in host: %r' % self._host) - uses_netloc = scheme_uses_netloc(self._scheme, uses_netloc) self._uses_netloc = _typecheck("uses_netloc", uses_netloc, bool, NoneType) @@ -765,15 +759,6 @@ def userinfo(self): """ return self._userinfo - @property - def family(self): - """Set to a socket constant (:data:`socket.AF_INET` or - :data:`socket.AF_INET6`) when the :attr:`~hyperlink.URL.host` - is an IP address. Set to ``None`` if the host is a domain name or - not set. - """ - return self._family - @property def uses_netloc(self): """ @@ -864,7 +849,7 @@ def absolute(self): def replace(self, scheme=_UNSET, host=_UNSET, path=_UNSET, query=_UNSET, fragment=_UNSET, port=_UNSET, rooted=_UNSET, userinfo=_UNSET, - family=_UNSET, uses_netloc=_UNSET): + uses_netloc=_UNSET): """:class:`URL` objects are immutable, which means that attributes are designed to be set only once, at construction. Instead of modifying an existing URL, one simply creates a copy with the @@ -885,9 +870,6 @@ def replace(self, scheme=_UNSET, host=_UNSET, path=_UNSET, query=_UNSET, rooted (bool): Whether or not the path begins with a slash. userinfo (unicode): The username or colon-separated username:password pair. - family: A socket module constant used when the host is an - IP constant to differentiate IPv4 and domain names, as - well as validate IPv6. uses_netloc (bool): Indicates whether two slashes appear between the scheme and the host (``http://eg.com`` vs ``mailto:e@g.com``) From 89c61a8fe46daa4b045668ea23d4616a0db2cde2 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Fri, 7 Jul 2017 21:00:12 -0700 Subject: [PATCH 03/11] clean up some commented-out code --- hyperlink/_url.py | 1 - hyperlink/test/test_url.py | 11 ----------- 2 files changed, 12 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index c42201b6..96a56dce 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -950,7 +950,6 @@ def from_text(cls, text): raise URLParseError('port must not be empty') raise URLParseError('expected integer for port, not %r' % port_str) - # _, host = parse_host(host) if host: host = host.lstrip('[').rstrip(']') diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index 9b43bbb2..3fadf9a6 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -856,7 +856,6 @@ def test_ipv6_with_port(self): url = URL.from_text(t) assert url.host == '2001:0db8:85a3:0000:0000:8a2e:0370:7334' assert url.port == 80 - # assert url.family == socket.AF_INET6 assert SCHEME_PORT_MAP[url.scheme] != url.port def test_invalid_ipv6(self): @@ -871,16 +870,6 @@ def test_invalid_ipv6(self): socket.AF_INET6, ip) self.assertRaises(URLParseError, URL.from_text, url_text) - # def test_ip_family_detection(self): - # u = URL.from_text('http://giggle.com') - # self.assertEqual(u.family, None) - # - # u = URL.from_text('http://127.0.0.1/a/b/?c=d') - # self.assertEqual(u.family, socket.AF_INET) - # - # u = URL.from_text('http://[::1]/a/b/?c=d') - # self.assertEqual(u.family, socket.AF_INET6) - def test_invalid_port(self): self.assertRaises(URLParseError, URL.from_text, 'ftp://portmouth:smash') self.assertRaises(ValueError, URL.from_text, From 8fe1532552044322f27147b860e4e5f9d5e7b402 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sat, 8 Jul 2017 00:26:10 -0700 Subject: [PATCH 04/11] wip: in-progress regexing for authority parsing --- hyperlink/_url.py | 23 +++++++++++++++++++++-- hyperlink/test/test_url.py | 3 ++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 96a56dce..aea7eb7d 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -122,13 +122,32 @@ def __nonzero__(self): # URL parsing regex (based on RFC 3986 Appendix B, with modifications) _URL_RE = re.compile(r'^((?P[^:/?#]+):)?' - r'((?P<_netloc_sep>//)(?P[^/?#]*))?' + r'((?P<_netloc_sep>//)' + r'(?P' + r'(?P[^@/?#]*@)?' + r'(?P(\[[^\[]/?#]*\])|([^:/?#[\]]*))?' + r':?(?P\d+)?' + r'))?' r'(?P[^?#]*)' r'(\?(?P[^#]*))?' r'(#(?P.*))?') _SCHEME_RE = re.compile(r'^[a-zA-Z0-9+-.]*$') +_URL_RE_PATT = (r'^((?P[^:/?#]+):)?' + r'((?P<_netloc_sep>//)' + r'(?P' + r'(?P[^@/?#]*@)?' + r'(?P(?P\[[^[\]/?#]*\])|(?P[^:/?#[\]]*))?' + r':?(?P\d+)?' + r'))?' # close authority group + r'(?P[^?#]*)' + r'(\?(?P[^#]*))?' + r'(#(?P.*))?') +_URL_RE = re.compile(_URL_RE_PATT) +# ^((?P[^:/?#]+):)?((?P<_netloc_sep>//)(?P(?P[^@/?#]*@)?(?P(?P\[[^\[]/?#]*\])|(?P[^\[]:/?#]*))?:?(?P\d+)?))?(?P[^?#]*)(\?(?P[^#]*))?(#(?P.*))? + + _HEX_CHAR_MAP = dict([((a + b).encode('ascii'), unichr(int(a + b, 16)).encode('charmap')) for a in string.hexdigits for b in string.hexdigits]) @@ -641,6 +660,7 @@ def __init__(self, scheme=None, host=None, path=(), query=(), fragment=u'', ' "-", and "." allowed. Did you meant to call' ' %s.from_text()?' % (self._scheme, self.__class__.__name__)) + _, self._host = parse_host(_textcheck('host', host, '/?#@')) if isinstance(path, unicode): raise TypeError("expected iterable of text for path, not: %r" @@ -952,7 +972,6 @@ def from_text(cls, text): % port_str) if host: host = host.lstrip('[').rstrip(']') - scheme = gs['scheme'] or u'' fragment = gs['fragment'] or u'' uses_netloc = bool(gs['_netloc_sep']) diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index 3fadf9a6..86ad6c28 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -10,7 +10,8 @@ from .common import HyperlinkTestCase from .. import URL, URLParseError # automatically import the py27 windows implementation when appropriate -from .._url import inet_pton, SCHEME_PORT_MAP +from .. import _url +from .._url import inet_pton, SCHEME_PORT_MAP, parse_host unicode = type(u'') From 4b8df125a6d0946b612503640a0b38205e02ca64 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 9 Jul 2017 01:03:37 -0700 Subject: [PATCH 05/11] adding a small "basic" test to test_url for quick sanity testing --- hyperlink/test/test_url.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index 86ad6c28..b95b89c4 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -859,6 +859,29 @@ def test_ipv6_with_port(self): assert url.port == 80 assert SCHEME_PORT_MAP[url.scheme] != url.port + def test_basic(self): + text = 'https://user:pass@example.com/path/to/here?k=v#nice' + url = URL.from_text(text) + assert url.scheme == 'https' + assert url.userinfo == 'user:pass' + assert url.host == 'example.com' + assert url.path == ('path', 'to', 'here') + assert url.fragment == 'nice' + + text = 'https://user:pass@127.0.0.1/path/to/here?k=v#nice' + url = URL.from_text(text) + assert url.scheme == 'https' + assert url.userinfo == 'user:pass' + assert url.host == '127.0.0.1' + assert url.path == ('path', 'to', 'here') + + text = 'https://user:pass@[::1]/path/to/here?k=v#nice' + url = URL.from_text(text) + assert url.scheme == 'https' + assert url.userinfo == 'user:pass' + assert url.host == '::1' + assert url.path == ('path', 'to', 'here') + def test_invalid_ipv6(self): invalid_ipv6_ips = ['2001::0234:C1ab::A0:aabc:003F', '2001::1::3F', From 3652cac04a0158a23832cad5c65be779363242ee Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 9 Jul 2017 01:05:45 -0700 Subject: [PATCH 06/11] switch authority parsing for URL.from_text to be regex-based, while preserving all the previous error messaging for common URL malformations (e.g., typos). this also reverts the primary URL regex to be much closer to the old RFC3986-based pattern --- hyperlink/_url.py | 71 ++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 41 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index aea7eb7d..0fcd93e1 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -120,31 +120,22 @@ def __nonzero__(self): _UNRESERVED_CHARS = frozenset('~-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz') + # URL parsing regex (based on RFC 3986 Appendix B, with modifications) _URL_RE = re.compile(r'^((?P[^:/?#]+):)?' r'((?P<_netloc_sep>//)' - r'(?P' - r'(?P[^@/?#]*@)?' - r'(?P(\[[^\[]/?#]*\])|([^:/?#[\]]*))?' - r':?(?P\d+)?' - r'))?' + r'(?P[^/?#]*))?' r'(?P[^?#]*)' r'(\?(?P[^#]*))?' - r'(#(?P.*))?') + r'(#(?P.*))?$') _SCHEME_RE = re.compile(r'^[a-zA-Z0-9+-.]*$') +_AUTHORITY_RE = re.compile(r'^(?:(?P[^@/?#]*)@)?' + r'(?P' + r'(?:\[(?P[^[\]/?#]*)\])' + r'|(?P[^:/?#[\]]*)' + r'|(?P.*?))?' + r'(?::(?P\d*))?$') - -_URL_RE_PATT = (r'^((?P[^:/?#]+):)?' - r'((?P<_netloc_sep>//)' - r'(?P' - r'(?P[^@/?#]*@)?' - r'(?P(?P\[[^[\]/?#]*\])|(?P[^:/?#[\]]*))?' - r':?(?P\d+)?' - r'))?' # close authority group - r'(?P[^?#]*)' - r'(\?(?P[^#]*))?' - r'(#(?P.*))?') -_URL_RE = re.compile(_URL_RE_PATT) # ^((?P[^:/?#]+):)?((?P<_netloc_sep>//)(?P(?P[^@/?#]*@)?(?P(?P\[[^\[]/?#]*\])|(?P[^\[]:/?#]*))?:?(?P\d+)?))?(?P[^?#]*)(\?(?P[^#]*))?(#(?P.*))? @@ -948,30 +939,28 @@ def from_text(cls, text): except AttributeError: raise URLParseError('could not parse url: %r' % text) - au_text = gs['authority'] - userinfo, hostinfo = u'', au_text + au_text = gs['authority'] or u'' + au_m = _AUTHORITY_RE.match(au_text) + try: + au_gs = au_m.groupdict() + except AttributeError: + raise URLParseError('invalid authority %r in url: %r' + % (au_text, text)) + if au_gs['bad_host']: + raise URLParseError('invalid host %r in url: %r') + + userinfo = au_gs['userinfo'] or u'' - if au_text: - userinfo, sep, hostinfo = au_text.rpartition('@') + host = au_gs['ipv6_host'] or au_gs['plain_host'] + port = au_gs['port'] + if port is not None: + try: + port = int(port) + except ValueError: + if not port: # TODO: excessive? + raise URLParseError('port must not be empty') + raise URLParseError('expected integer for port, not %r' % port) - host, port = None, None - if hostinfo: - host, sep, port_str = hostinfo.rpartition(u':') - if not sep: - host = port_str - else: - if u']' in port_str: - host = hostinfo # wrong split, was an ipv6 - else: - try: - port = int(port_str) - except ValueError: - if not port_str: # TODO: excessive? - raise URLParseError('port must not be empty') - raise URLParseError('expected integer for port, not %r' - % port_str) - if host: - host = host.lstrip('[').rstrip(']') scheme = gs['scheme'] or u'' fragment = gs['fragment'] or u'' uses_netloc = bool(gs['_netloc_sep']) @@ -985,7 +974,7 @@ def from_text(cls, text): rooted = False else: path = () - rooted = bool(hostinfo) + rooted = bool(au_text) if gs['query']: query = ((qe.split(u"=", 1) if u'=' in qe else (qe, None)) for qe in gs['query'].split(u"&")) From a5e365b10aac7e7d0361883901b753ca657142cd Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 9 Jul 2017 01:32:41 -0700 Subject: [PATCH 07/11] add a few bad authority tests --- hyperlink/_url.py | 3 ++- hyperlink/test/test_url.py | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 0fcd93e1..5eb6667e 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -671,6 +671,7 @@ def __init__(self, scheme=None, host=None, path=(), query=(), fragment=u'', uses_netloc = scheme_uses_netloc(self._scheme, uses_netloc) self._uses_netloc = _typecheck("uses_netloc", uses_netloc, bool, NoneType) + return @property @@ -958,7 +959,7 @@ def from_text(cls, text): port = int(port) except ValueError: if not port: # TODO: excessive? - raise URLParseError('port must not be empty') + raise URLParseError('port must not be empty: %r' % au_text) raise URLParseError('expected integer for port, not %r' % port) scheme = gs['scheme'] or u'' diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index b95b89c4..4e281a34 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -1080,3 +1080,10 @@ def test_from_text_type(self): assert URL.from_text(u'#ok').fragment == u'ok' # sanity self.assertRaises(TypeError, URL.from_text, b'bytes://x.y.z') self.assertRaises(TypeError, URL.from_text, object()) + + def test_from_text_bad_authority(self): + self.assertRaises(URLParseError, URL.from_text, 'http://[::1/') + self.assertRaises(URLParseError, URL.from_text, 'http://::1]/') + self.assertRaises(URLParseError, URL.from_text, 'http://[[::1]/') + self.assertRaises(URLParseError, URL.from_text, 'http://[::1]]/') + self.assertRaises(URLParseError, URL.from_text, 'http://127.0.0.1::80') From 1a3cbac862fa236f057c9426e5d2b0ef120482f2 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 9 Jul 2017 01:36:04 -0700 Subject: [PATCH 08/11] add an empty port test --- hyperlink/test/test_url.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index 4e281a34..7d025b33 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -1082,8 +1082,13 @@ def test_from_text_type(self): self.assertRaises(TypeError, URL.from_text, object()) def test_from_text_bad_authority(self): + # bad ipv6 parentheses self.assertRaises(URLParseError, URL.from_text, 'http://[::1/') self.assertRaises(URLParseError, URL.from_text, 'http://::1]/') self.assertRaises(URLParseError, URL.from_text, 'http://[[::1]/') self.assertRaises(URLParseError, URL.from_text, 'http://[::1]]/') + + # empty port + self.assertRaises(URLParseError, URL.from_text, 'http://127.0.0.1:') + # extra port colon (makes for an invalid host) self.assertRaises(URLParseError, URL.from_text, 'http://127.0.0.1::80') From d7021afa95e5e588f05dd212138318764796a45e Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 9 Jul 2017 02:42:48 -0700 Subject: [PATCH 09/11] add a test for a non-integer port --- hyperlink/_url.py | 2 +- hyperlink/test/test_url.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 5eb6667e..595c3a44 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -134,7 +134,7 @@ def __nonzero__(self): r'(?:\[(?P[^[\]/?#]*)\])' r'|(?P[^:/?#[\]]*)' r'|(?P.*?))?' - r'(?::(?P\d*))?$') + r'(?::(?P.*))?$') # ^((?P[^:/?#]+):)?((?P<_netloc_sep>//)(?P(?P[^@/?#]*@)?(?P(?P\[[^\[]/?#]*\])|(?P[^\[]:/?#]*))?:?(?P\d+)?))?(?P[^?#]*)(\?(?P[^#]*))?(#(?P.*))? diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index 7d025b33..d2e8fbef 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -1090,5 +1090,7 @@ def test_from_text_bad_authority(self): # empty port self.assertRaises(URLParseError, URL.from_text, 'http://127.0.0.1:') + # non-integer port + self.assertRaises(URLParseError, URL.from_text, 'http://127.0.0.1:hi') # extra port colon (makes for an invalid host) self.assertRaises(URLParseError, URL.from_text, 'http://127.0.0.1::80') From a515a87db556d6eac5c1ab58c227f94076892826 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 16 Jul 2017 15:11:05 -0700 Subject: [PATCH 10/11] added a corner case test of text that does not pass the url regex --- hyperlink/_url.py | 2 -- hyperlink/test/test_url.py | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 595c3a44..cda28a5a 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -136,8 +136,6 @@ def __nonzero__(self): r'|(?P.*?))?' r'(?::(?P.*))?$') -# ^((?P[^:/?#]+):)?((?P<_netloc_sep>//)(?P(?P[^@/?#]*@)?(?P(?P\[[^\[]/?#]*\])|(?P[^\[]:/?#]*))?:?(?P\d+)?))?(?P[^?#]*)(\?(?P[^#]*))?(#(?P.*))? - _HEX_CHAR_MAP = dict([((a + b).encode('ascii'), unichr(int(a + b, 16)).encode('charmap')) diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index d2e8fbef..975eab04 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -882,6 +882,9 @@ def test_basic(self): assert url.host == '::1' assert url.path == ('path', 'to', 'here') + def test_invalid_url(self): + self.assertRaises(URLParseError, URL.from_text, '#\n\n') + def test_invalid_ipv6(self): invalid_ipv6_ips = ['2001::0234:C1ab::A0:aabc:003F', '2001::1::3F', From 1605a1590ae5b500604638ca3e7cc0d6879d5813 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Mon, 17 Jul 2017 19:58:46 -0700 Subject: [PATCH 11/11] add an invalid authority for a bit more coverage --- hyperlink/test/test_url.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index 975eab04..f779c54a 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -885,6 +885,9 @@ def test_basic(self): def test_invalid_url(self): self.assertRaises(URLParseError, URL.from_text, '#\n\n') + def test_invalid_authority_url(self): + self.assertRaises(URLParseError, URL.from_text, 'http://abc:\n\n/#') + def test_invalid_ipv6(self): invalid_ipv6_ips = ['2001::0234:C1ab::A0:aabc:003F', '2001::1::3F',