From 24dbe887e2c27146cd39327761302486dc06f859 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Fri, 26 Feb 2016 21:14:59 +0300 Subject: [PATCH] Add tests by @redapple, do urljoin on unicode strings. --- tests/test_html.py | 22 ++++++++++++++++++++++ w3lib/html.py | 6 +++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/tests/test_html.py b/tests/test_html.py index 1c9a0035..5906f59c 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -286,6 +286,28 @@ def test_tag_name(self): """ self.assertEqual(get_base_url(text, baseurl), 'https://example.org') + def test_get_base_url_utf8(self): + baseurl = u'https://example.org' + + text = u""" + + Dummy + blahablsdfsal& + """ + self.assertEqual(get_base_url(text, baseurl), + 'http://example.org/snowman%E2%8D%A8') + + def test_get_base_url_latin1(self): + baseurl = u'https://example.org' + + text = u""" + + Dummy + blahablsdfsal& + """ + self.assertEqual(get_base_url(text, baseurl, encoding='latin-1'), + 'http://example.org/sterling%A3') + class GetMetaRefreshTest(unittest.TestCase): def test_get_meta_refresh(self): diff --git a/w3lib/html.py b/w3lib/html.py index fbfc1ad9..23d99b16 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -281,11 +281,11 @@ def get_base_url(text, baseurl='', encoding='utf-8'): """ text = str_to_unicode(text, encoding) - baseurl = unicode_to_str(baseurl, encoding) m = _baseurl_re.search(text) if m: - baseurl = moves.urllib.parse.urljoin(baseurl, m.group(1).encode(encoding)) - return safe_url_string(baseurl) + baseurl = str_to_unicode(baseurl, encoding) + baseurl = moves.urllib.parse.urljoin(baseurl, m.group(1)) + return safe_url_string(unicode_to_str(baseurl, encoding)) def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')): """Return the http-equiv parameter of the HTML meta element from the given