From c25e8b48eb5e805bd91695294170c02887e20e34 Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Thu, 14 Jul 2016 12:00:55 +0200
Subject: [PATCH 1/2] Add tests for URLs with non-IDNA-encodable netlocs

---
 tests/test_url.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/test_url.py b/tests/test_url.py
index debedb77..7ac5d092 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -165,6 +165,20 @@ def test_safe_url_idna(self):
             safeurl = safe_url_string(safe_result)
             self.assertEqual(safeurl, safe_result)
 
+    def test_safe_url_idna_encoding_failure(self):
+        # missing DNS label
+        self.assertEqual(
+            safe_url_string(u"http://.example.com/résumé?q=résumé"),
+            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
+
+        # DNS label too long
+        self.assertEqual(
+            safe_url_string(
+                u"http://www.{label}.com/résumé?q=résumé".format(
+                    label=u"example"*11)),
+            "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
+                label=u"example"*11))
+
     def test_safe_download_url(self):
         self.assertEqual(safe_download_url('http://www.example.org'),
                          'http://www.example.org/')

From 10d1faf213ac01e56e751cdac444b310bb4ab84e Mon Sep 17 00:00:00 2001
From: Paul Tremberth
Date: Thu, 14 Jul 2016 12:01:21 +0200
Subject: [PATCH 2/2] Handle IDNA encoding failures for netloc part

---
 w3lib/url.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/w3lib/url.py b/w3lib/url.py
index 2d0bc106..c3d84664 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -90,11 +90,18 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
     parts = urlsplit(to_unicode(url, encoding=encoding,
                                 errors='percentencode'))
 
+    # IDNA encoding can fail for too long labels (>63 characters)
+    # or missing labels (e.g. http://.example.com)
+    try:
+        netloc = parts.netloc.encode('idna')
+    except UnicodeError:
+        netloc = parts.netloc
+
     # quote() in Python2 return type follows input type;
     # quote() in Python3 always returns Unicode (native str)
     return urlunsplit((
         to_native_str(parts.scheme),
-        to_native_str(parts.netloc.encode('idna')),
+        to_native_str(netloc),
 
         # default encoding for path component SHOULD be UTF-8
         quote(to_bytes(parts.path, path_encoding), _safe_chars),