From 24dbe887e2c27146cd39327761302486dc06f859 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin <kostia.lopuhin@gmail.com>
Date: Fri, 26 Feb 2016 21:14:59 +0300
Subject: [PATCH 1/5] Add tests by @redapple, do urljoin on unicode strings.

---
 tests/test_html.py | 22 ++++++++++++++++++++++
 w3lib/html.py      |  6 +++---
 2 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/tests/test_html.py b/tests/test_html.py
index 1c9a0035..5906f59c 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -286,6 +286,28 @@ def test_tag_name(self):
             </html>"""
         self.assertEqual(get_base_url(text, baseurl), 'https://example.org')
 
+    def test_get_base_url_utf8(self):
+        baseurl = u'https://example.org'
+
+        text = u"""
+            <html>
+            <head><title>Dummy</title><base href='http://example.org/snowman\u2368' /></head>
+            <body>blahablsdfsal&amp;</body>
+            </html>"""
+        self.assertEqual(get_base_url(text, baseurl),
+                         'http://example.org/snowman%E2%8D%A8')
+
+    def test_get_base_url_latin1(self):
+        baseurl = u'https://example.org'
+
+        text = u"""
+            <html>
+            <head><title>Dummy</title><base href='http://example.org/sterling\u00a3' /></head>
+            <body>blahablsdfsal&amp;</body>
+            </html>"""
+        self.assertEqual(get_base_url(text, baseurl, encoding='latin-1'),
+                         'http://example.org/sterling%A3')
+
 
 class GetMetaRefreshTest(unittest.TestCase):
     def test_get_meta_refresh(self):
diff --git a/w3lib/html.py b/w3lib/html.py
index fbfc1ad9..23d99b16 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -281,11 +281,11 @@ def get_base_url(text, baseurl='', encoding='utf-8'):
     """
 
     text = str_to_unicode(text, encoding)
-    baseurl = unicode_to_str(baseurl, encoding)
     m = _baseurl_re.search(text)
     if m:
-        baseurl = moves.urllib.parse.urljoin(baseurl, m.group(1).encode(encoding))
-    return safe_url_string(baseurl)
+        baseurl = str_to_unicode(baseurl, encoding)
+        baseurl = moves.urllib.parse.urljoin(baseurl, m.group(1))
+    return safe_url_string(unicode_to_str(baseurl, encoding))
 
 def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
     """Return  the http-equiv parameter of the HTML meta element from the given

From 2c00a144c8a392ac14920f9ccb8abcf7b9044b36 Mon Sep 17 00:00:00 2001
From: Paul Tremberth <paul.tremberth@gmail.com>
Date: Tue, 22 Mar 2016 22:52:45 +0100
Subject: [PATCH 2/5] Fix tests on non-ASCII characters in URL + new
 safe_url_string()

---
 tests/test_html.py | 29 ++++++++++++++++--
 tests/test_url.py  | 75 ++++++++++++++++++++++++++++++++++++++++++----
 w3lib/html.py      | 35 ++++++++++++----------
 w3lib/url.py       | 65 +++++++++++++++++++++++++++-------------
 w3lib/util.py      | 32 ++++++++++++++++++++
 5 files changed, 192 insertions(+), 44 deletions(-)

diff --git a/tests/test_html.py b/tests/test_html.py
index 5906f59c..68133cb5 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -236,6 +236,7 @@ def test_unquote_markup(self):
 
 
 class GetBaseUrlTest(unittest.TestCase):
+
     def test_get_base_url(self):
         baseurl = u'https://example.org'
 
@@ -298,6 +299,8 @@ def test_get_base_url_utf8(self):
                          'http://example.org/snowman%E2%8D%A8')
 
     def test_get_base_url_latin1(self):
+        # page encoding does not affect URL path encoding before percent-escaping
+        # we should still use UTF-8 by default
         baseurl = u'https://example.org'
 
         text = u"""
@@ -306,7 +309,19 @@ def test_get_base_url_latin1(self):
             <body>blahablsdfsal&amp;</body>
             </html>"""
         self.assertEqual(get_base_url(text, baseurl, encoding='latin-1'),
-                         'http://example.org/sterling%A3')
+                         'http://example.org/sterling%C2%A3')
+
+    def test_get_base_url_latin1_percent(self):
+        # non-UTF-8 percent-encoded character sequences are left untouched
+        baseurl = u'https://example.org'
+
+        text = u"""
+            <html>
+            <head><title>Dummy</title><base href='http://example.org/sterling%a3' /></head>
+            <body>blahablsdfsal&amp;</body>
+            </html>"""
+        self.assertEqual(get_base_url(text, baseurl),
+                         'http://example.org/sterling%a3')
 
 
 class GetMetaRefreshTest(unittest.TestCase):
@@ -357,10 +372,18 @@ def test_nonascii_url_utf8(self):
         self.assertEqual(get_meta_refresh(body, baseurl), (3, 'http://example.com/to%C2%A3'))
 
     def test_nonascii_url_latin1(self):
-        # non-ascii chars in the url (latin1)
+        # non-ascii chars in the url path (latin1)
+        # should end up UTF-8 encoded anyway
         baseurl = 'http://example.com'
         body = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">"""
-        self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'), (3, 'http://example.com/to%A3'))
+        self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'), (3, 'http://example.com/to%C2%A3'))
+
+    def test_nonascii_url_latin1_query(self):
+        # non-ascii chars in the url path and query (latin1)
+        # only query part should be kept latin1 encoded before percent escaping
+        baseurl = 'http://example.com'
+        body = b"""<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3?unit=\xb5">"""
+        self.assertEqual(get_meta_refresh(body, baseurl, 'latin1'), (3, 'http://example.com/to%C2%A3?unit=%B5'))
 
     def test_commented_meta_refresh(self):
         # html commented meta refresh header must not directed
diff --git a/tests/test_url.py b/tests/test_url.py
index 77641203..f0807534 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from __future__ import absolute_import
 import os
 import unittest
@@ -14,9 +15,13 @@ def test_safe_url_string(self):
                         '%E8%8D%89%E8%96%99%20%E7%B4%A0%E5%AD%90')
         self.assertEqual(safe_url_string(motoko),
                          safe_url_string(safe_url_string(motoko)))
-        self.assertEqual(safe_url_string(u'\xa9'), # copyright symbol
+        self.assertEqual(safe_url_string(u'©'), # copyright symbol
                          '%C2%A9')
-        self.assertEqual(safe_url_string(u'\xa9', 'iso-8859-1'),
+        # page-encoding does not affect URL path
+        self.assertEqual(safe_url_string(u'©', 'iso-8859-1'),
+                         '%C2%A9')
+        # path_encoding does
+        self.assertEqual(safe_url_string(u'©', path_encoding='iso-8859-1'),
                          '%A9')
         self.assertEqual(safe_url_string("http://www.example.org/"),
                         'http://www.example.org/')
@@ -31,16 +36,76 @@ def test_safe_url_string(self):
         self.assertEqual(safe_url_string("http://www.example.com/Brochures_&_Paint_Cards&PageSize=200"),
                                          "http://www.example.com/Brochures_&_Paint_Cards&PageSize=200")
 
-        safeurl = safe_url_string(u"http://www.example.com/\xa3", encoding='latin-1')
+        # page-encoding does not affect URL path
+        # we still end up UTF-8 encoding characters before percent-escaping
+        safeurl = safe_url_string(u"http://www.example.com/£")
         self.assertTrue(isinstance(safeurl, str))
-        self.assertEqual(safeurl, "http://www.example.com/%A3")
+        self.assertEqual(safeurl, "http://www.example.com/%C2%A3")
+
+        safeurl = safe_url_string(u"http://www.example.com/£", encoding='utf-8')
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%C2%A3")
 
-        safeurl = safe_url_string(u"http://www.example.com/\xa3", encoding='utf-8')
+        safeurl = safe_url_string(u"http://www.example.com/£", encoding='latin-1')
         self.assertTrue(isinstance(safeurl, str))
         self.assertEqual(safeurl, "http://www.example.com/%C2%A3")
 
+        safeurl = safe_url_string(u"http://www.example.com/£", path_encoding='latin-1')
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%A3")
+
         self.assertTrue(isinstance(safe_url_string(b'http://example.com/'), str))
 
+    def test_safe_url_string_with_query(self):
+        safeurl = safe_url_string(u"http://www.example.com/£?unit=µ")
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")
+
+        safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='utf-8')
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")
+
+        safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='latin-1')
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")
+
+        safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", path_encoding='latin-1')
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%C2%B5")
+
+        safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='latin-1', path_encoding='latin-1')
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%B5")
+
+    def test_safe_url_string_misc(self):
+        # mixing Unicode and percent-escaped sequences
+        safeurl = safe_url_string(u"http://www.example.com/£?unit=%C2%B5")
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")
+
+        safeurl = safe_url_string(u"http://www.example.com/%C2%A3?unit=µ")
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")
+
+    def test_safe_url_string_bytes_input(self):
+        safeurl = safe_url_string(b"http://www.example.com/")
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/")
+
+        # bytes input is assumed to be UTF-8
+        safeurl = safe_url_string(b"http://www.example.com/\xc2\xb5")
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%C2%B5")
+
+        # page-encoding encoded bytes still end up as UTF-8 sequences in path
+        safeurl = safe_url_string(b"http://www.example.com/\xb5", encoding='latin1')
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%C2%B5")
+
+        safeurl = safe_url_string(b"http://www.example.com/\xa3?unit=\xb5", encoding='latin1')
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")
+
     def test_safe_download_url(self):
         self.assertEqual(safe_download_url('http://www.example.org/../'),
                          'http://www.example.org/')
diff --git a/w3lib/html.py b/w3lib/html.py
index 23d99b16..24d01a55 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -8,7 +8,7 @@
 import six
 from six import moves
 
-from w3lib.util import str_to_unicode, unicode_to_str
+from w3lib.util import to_bytes, to_unicode
 from w3lib.url import safe_url_string
 
 _ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
@@ -91,10 +91,10 @@ def convert_entity(m):
 
         return u'' if remove_illegal and groups.get('semicolon') else m.group(0)
 
-    return _ent_re.sub(convert_entity, str_to_unicode(text, encoding))
+    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
 
 def has_entities(text, encoding=None):
-    return bool(_ent_re.search(str_to_unicode(text, encoding)))
+    return bool(_ent_re.search(to_unicode(text, encoding)))
 
 def replace_tags(text, token='', encoding=None):
     """Replace all markup tags found in the given `text` by the given token.
@@ -116,7 +116,7 @@ def replace_tags(text, token='', encoding=None):
 
     """
 
-    return _tag_re.sub(token, str_to_unicode(text, encoding))
+    return _tag_re.sub(token, to_unicode(text, encoding))
 
 
 _REMOVECOMMENTS_RE = re.compile(u'<!--.*?-->', re.DOTALL)
@@ -130,7 +130,7 @@ def remove_comments(text, encoding=None):
 
     """
 
-    text = str_to_unicode(text, encoding)
+    text = to_unicode(text, encoding)
     return _REMOVECOMMENTS_RE.sub(u'', text)
 
 def remove_tags(text, which_ones=(), keep=(), encoding=None):
@@ -199,7 +199,7 @@ def remove_tag(m):
     regex = '</?([^ >/]+).*?>'
     retags = re.compile(regex, re.DOTALL | re.IGNORECASE)
 
-    return retags.sub(remove_tag, str_to_unicode(text, encoding))
+    return retags.sub(remove_tag, to_unicode(text, encoding))
 
 def remove_tags_with_content(text, which_ones=(), encoding=None):
     """Remove tags and their content.
@@ -215,7 +215,7 @@ def remove_tags_with_content(text, which_ones=(), encoding=None):
 
     """
 
-    text = str_to_unicode(text, encoding)
+    text = to_unicode(text, encoding)
     if which_ones:
         tags = '|'.join([r'<%s.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
         retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
@@ -235,9 +235,9 @@ def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
 
     """
 
-    text = str_to_unicode(text, encoding)
+    text = to_unicode(text, encoding)
     for ec in which_ones:
-        text = text.replace(ec, str_to_unicode(replace_by, encoding))
+        text = text.replace(ec, to_unicode(replace_by, encoding))
     return text
 
 def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
@@ -261,7 +261,7 @@ def _get_fragments(txt, pattern):
             offset = match_e
         yield txt[offset:]
 
-    text = str_to_unicode(text, encoding)
+    text = to_unicode(text, encoding)
     ret_text = u''
     for fragment in _get_fragments(text, _cdata_re):
         if isinstance(fragment, six.string_types):
@@ -280,12 +280,15 @@ def get_base_url(text, baseurl='', encoding='utf-8'):
 
     """
 
-    text = str_to_unicode(text, encoding)
+    text = to_unicode(text, encoding)
     m = _baseurl_re.search(text)
     if m:
-        baseurl = str_to_unicode(baseurl, encoding)
-        baseurl = moves.urllib.parse.urljoin(baseurl, m.group(1))
-    return safe_url_string(unicode_to_str(baseurl, encoding))
+        return moves.urllib.parse.urljoin(
+            safe_url_string(baseurl),
+            safe_url_string(m.group(1), encoding=encoding)
+        )
+    else:
+        return safe_url_string(baseurl)
 
 def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
     """Return  the http-equiv parameter of the HTML meta element from the given
@@ -298,9 +301,9 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
     """
 
     if six.PY2:
-        baseurl = unicode_to_str(baseurl, encoding)
+        baseurl = to_bytes(baseurl, encoding)
     try:
-        text = str_to_unicode(text, encoding)
+        text = to_unicode(text, encoding)
     except UnicodeDecodeError:
         print(text)
         raise
diff --git a/w3lib/url.py b/w3lib/url.py
index 43932405..c6af1530 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -7,8 +7,11 @@
 import posixpath
 import warnings
 import six
-from six import moves
-from w3lib.util import unicode_to_str
+from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
+                                    urldefrag, urlencode, urlparse,
+                                    quote, parse_qs, parse_qsl)
+from six.moves.urllib.request import pathname2url, url2pathname
+from w3lib.util import to_bytes, to_native_str, to_unicode
 
 # Python 2.x urllib.always_safe become private in Python 3.x;
 # its content is copied here
@@ -47,13 +50,13 @@ def urljoin_rfc(base, ref, encoding='utf-8'):
 
     str_base = unicode_to_str(base, encoding)
     str_ref = unicode_to_str(ref, encoding)
-    return moves.urllib.parse.urljoin(str_base, str_ref)
+    return urljoin(str_base, str_ref)
 
 _reserved = b';/?:@&=+$|,#' # RFC 3986 (Generic Syntax)
 _unreserved_marks = b"-_.!~*'()" # RFC 3986 sec 2.3
 _safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks
 
-def safe_url_string(url, encoding='utf8'):
+def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
     """Convert the given url into a legal URL by escaping unsafe characters
     according to RFC-3986.
 
@@ -67,9 +70,31 @@ def safe_url_string(url, encoding='utf8'):
 
     Always returns a str.
     """
-    s = unicode_to_str(url, encoding)
-    return moves.urllib.parse.quote(s, _safe_chars)
-
+    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
+    # so let's decode (to Unicode) using page encoding.
+    #
+    # it is assumed that a raw bytes input comes from the page
+    # corresponding to the encoding
+    #
+    # Note: if this assumption is wrong, this will fail;
+    #       in the general case, users are required to use Unicode
+    #       or safe ASCII bytes input
+    parts = urlsplit(to_unicode(url, encoding=encoding))
+
+    # quote() in Python2 return type follows input type;
+    # quote() in Python3 always returns Unicode (native str)
+    return urlunsplit((
+        to_native_str(parts.scheme),
+        to_native_str(parts.netloc),
+
+        # default encoding for path component SHOULD be UTF-8
+        quote(to_bytes(parts.path, path_encoding), _safe_chars),
+
+        # encoding of query and fragment follows page encoding
+        # or form-charset (if known and passed)
+        quote(to_bytes(parts.query, encoding), _safe_chars),
+        quote(to_bytes(parts.fragment, encoding), _safe_chars),
+    ))
 
 _parent_dirs = re.compile(r'/?(\.\./)+')
 
@@ -82,14 +107,14 @@ def safe_download_url(url):
     to be within the document root.
     """
     safe_url = safe_url_string(url)
-    scheme, netloc, path, query, _ = moves.urllib.parse.urlsplit(safe_url)
+    scheme, netloc, path, query, _ = urlsplit(safe_url)
     if path:
         path = _parent_dirs.sub('', posixpath.normpath(path))
         if url.endswith('/') and not path.endswith('/'):
             path += '/'
     else:
         path = '/'
-    return moves.urllib.parse.urlunsplit((scheme, netloc, path, query, ''))
+    return urlunsplit((scheme, netloc, path, query, ''))
 
 def is_url(text):
     return text.partition("://")[0] in ('file', 'http', 'https')
@@ -123,8 +148,8 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
 
     """
 
-    queryparams = moves.urllib.parse.parse_qs(
-        moves.urllib.parse.urlsplit(str(url))[3],
+    queryparams = parse_qs(
+        urlsplit(str(url))[3],
         keep_blank_values=keep_blank_values
     )
     return queryparams.get(parameter, [default])[0]
@@ -157,7 +182,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
 
     if isinstance(parameterlist, (six.text_type, bytes)):
         parameterlist = [parameterlist]
-    url = moves.urllib.parse.urldefrag(url)[0]
+    url = urldefrag(url)[0]
     base, _, query = url.partition('?')
     seen = set()
     querylist = []
@@ -187,8 +212,8 @@ def add_or_replace_parameter(url, name, new_value):
     >>>
 
     """
-    parsed = moves.urllib.parse.urlsplit(url)
-    args = moves.urllib.parse.parse_qsl(parsed.query, keep_blank_values=True)
+    parsed = urlsplit(url)
+    args = parse_qsl(parsed.query, keep_blank_values=True)
 
     new_args = []
     found = False
@@ -202,15 +227,15 @@ def add_or_replace_parameter(url, name, new_value):
     if not found:
         new_args.append((name, new_value))
 
-    query = moves.urllib.parse.urlencode(new_args)
-    return moves.urllib.parse.urlunsplit(parsed._replace(query=query))
+    query = urlencode(new_args)
+    return urlunsplit(parsed._replace(query=query))
 
 
 def path_to_file_uri(path):
     """Convert local filesystem path to legal File URIs as described in:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
-    x = moves.urllib.request.pathname2url(os.path.abspath(path))
+    x = pathname2url(os.path.abspath(path))
     if os.name == 'nt':
         x = x.replace('|', ':') # http://bugs.python.org/issue5861
     return 'file:///%s' % x.lstrip('/')
@@ -219,8 +244,8 @@ def file_uri_to_path(uri):
     """Convert File URI to local filesystem path according to:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
-    uri_path = moves.urllib.parse.urlparse(uri).path
-    return moves.urllib.request.url2pathname(uri_path)
+    uri_path = urlparse(uri).path
+    return url2pathname(uri_path)
 
 def any_to_uri(uri_or_path):
     """If given a path name, return its File URI, otherwise return it
@@ -228,5 +253,5 @@ def any_to_uri(uri_or_path):
     """
     if os.path.splitdrive(uri_or_path)[0]:
         return path_to_file_uri(uri_or_path)
-    u = moves.urllib.parse.urlparse(uri_or_path)
+    u = urlparse(uri_or_path)
     return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
diff --git a/w3lib/util.py b/w3lib/util.py
index 09035b80..d8513eef 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -13,3 +13,35 @@ def unicode_to_str(text, encoding=None, errors='strict'):
     if isinstance(text, six.text_type):
         return text.encode(encoding, errors)
     return text
+
+def to_unicode(text, encoding=None, errors='strict'):
+    """Return the unicode representation of a bytes object `text`. If `text`
+    is already a unicode object, return it as-is."""
+    if isinstance(text, six.text_type):
+        return text
+    if not isinstance(text, (bytes, six.text_type)):
+        raise TypeError('to_unicode must receive a bytes, str or unicode '
+                        'object, got %s' % type(text).__name__)
+    if encoding is None:
+        encoding = 'utf-8'
+    return text.decode(encoding, errors)
+
+def to_bytes(text, encoding=None, errors='strict'):
+    """Return the binary representation of `text`. If `text`
+    is already a bytes object, return it as-is."""
+    if isinstance(text, bytes):
+        return text
+    if not isinstance(text, six.string_types):
+        raise TypeError('to_bytes must receive a unicode, str or bytes '
+                        'object, got %s' % type(text).__name__)
+    if encoding is None:
+        encoding = 'utf-8'
+    return text.encode(encoding, errors)
+
+def to_native_str(text, encoding=None, errors='strict'):
+    """ Return str representation of `text`
+    (bytes in Python 2.x and unicode in Python 3.x). """
+    if six.PY2:
+        return to_bytes(text, encoding, errors)
+    else:
+        return to_unicode(text, encoding, errors)

From 5daebcda3f91a3ec5404c3c38d4855ad4c92d619 Mon Sep 17 00:00:00 2001
From: Paul Tremberth <paul.tremberth@gmail.com>
Date: Wed, 23 Mar 2016 10:21:26 +0100
Subject: [PATCH 3/5] Update safe_url_string() docstring

---
 w3lib/url.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/w3lib/url.py b/w3lib/url.py
index c6af1530..3eae56b0 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -57,18 +57,20 @@ def urljoin_rfc(base, ref, encoding='utf-8'):
 _safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks
 
 def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
-    """Convert the given url into a legal URL by escaping unsafe characters
+    """Convert the given URL into a legal URL by escaping unsafe characters
     according to RFC-3986.
 
-    If a unicode url is given, it is first converted to str using the given
-    encoding (which defaults to 'utf-8'). When passing a encoding, you should
-    use the encoding of the original page (the page from which the url was
-    extracted from).
+    If a bytes URL is given, it is first converted to `str` using the given
+    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
+    URL path component (unless overridden by path_encoding), and given
+    encoding is used for query string or form data.
+    When passing an encoding, you should use the encoding of the
+    original page (the page from which the URL was extracted).
 
-    Calling this function on an already "safe" url will return the url
+    Calling this function on an already "safe" URL will return the URL
     unmodified.
 
-    Always returns a str.
+    Always returns a native `str` (bytes in Python2, unicode in Python3).
     """
     # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
     # so let's decode (to Unicode) using page encoding.

From 3253677571b23d627dbaf7499723cd118963b6ab Mon Sep 17 00:00:00 2001
From: Paul Tremberth <paul.tremberth@gmail.com>
Date: Wed, 23 Mar 2016 16:45:37 +0100
Subject: [PATCH 4/5] Remove deprecated urljoin_rfc()

Was deprecated since v1.1:
https://github.com/scrapy/w3lib/blob/v1.1/w3lib/url.py
---
 w3lib/url.py | 36 +-----------------------------------
 1 file changed, 1 insertion(+), 35 deletions(-)

diff --git a/w3lib/url.py b/w3lib/url.py
index 3eae56b0..ac5de7e3 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -7,7 +7,7 @@
 import posixpath
 import warnings
 import six
-from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
+from six.moves.urllib.parse import (urlsplit, urlunsplit,
                                     urldefrag, urlencode, urlparse,
                                     quote, parse_qs, parse_qsl)
 from six.moves.urllib.request import pathname2url, url2pathname
@@ -18,40 +18,6 @@
 _ALWAYS_SAFE_BYTES = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                       b'abcdefghijklmnopqrstuvwxyz'
                       b'0123456789' b'_.-')
-
-
-def urljoin_rfc(base, ref, encoding='utf-8'):
-    r"""
-    .. warning::
-
-        This function is deprecated and will be removed in future.
-        Please use ``urlparse.urljoin`` instead.
-
-    Same as urlparse.urljoin but supports unicode values in base and ref
-    parameters (in which case they will be converted to str using the given
-    encoding).
-
-    Always returns a str.
-
-    >>> import w3lib.url
-    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
-    'http://www.example.com/otherpath/index2.html'
-    >>>
-
-    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm')
-    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
-    >>>
-
-
-    """
-
-    warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
-        DeprecationWarning)
-
-    str_base = unicode_to_str(base, encoding)
-    str_ref = unicode_to_str(ref, encoding)
-    return urljoin(str_base, str_ref)
-
 _reserved = b';/?:@&=+$|,#' # RFC 3986 (Generic Syntax)
 _unreserved_marks = b"-_.!~*'()" # RFC 3986 sec 2.3
 _safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks

From b8b90555a92ba7d8a54495a5d2b3447539c57b0e Mon Sep 17 00:00:00 2001
From: Paul Tremberth <paul.tremberth@gmail.com>
Date: Fri, 25 Mar 2016 18:59:19 +0100
Subject: [PATCH 5/5] Support Internationalized Domain Names with
 safe_url_string()

---
 tests/test_url.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 w3lib/url.py      |  2 +-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/tests/test_url.py b/tests/test_url.py
index f0807534..9d73d1ee 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -106,6 +106,52 @@ def test_safe_url_string_bytes_input(self):
         self.assertTrue(isinstance(safeurl, str))
         self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")
 
+    def test_safe_url_idna(self):
+        # adapted from:
+        # https://ssl.icu-project.org/icu-bin/idnbrowser
+        # http://unicode.org/faq/idn.html
+        # + various others
+        websites = (
+            (u'http://www.färgbolaget.nu/färgbolaget', 'http://www.xn--frgbolaget-q5a.nu/f%C3%A4rgbolaget'),
+            (u'http://www.räksmörgås.se/?räksmörgås=yes', 'http://www.xn--rksmrgs-5wao1o.se/?r%C3%A4ksm%C3%B6rg%C3%A5s=yes'),
+            (u'http://www.brændendekærlighed.com/brændende/kærlighed', 'http://www.xn--brndendekrlighed-vobh.com/br%C3%A6ndende/k%C3%A6rlighed'),
+            (u'http://www.예비교사.com', 'http://www.xn--9d0bm53a3xbzui.com'),
+            (u'http://理容ナカムラ.com', 'http://xn--lck1c3crb1723bpq4a.com'),
+            (u'http://あーるいん.com', 'http://xn--l8je6s7a45b.com'),
+
+            # --- real websites ---
+
+            # in practice, this redirects (301) to http://www.buecher.de/?q=b%C3%BCcher
+            (u'http://www.bücher.de/?q=bücher', 'http://www.xn--bcher-kva.de/?q=b%C3%BCcher'),
+
+            # Japanese
+            (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'),
+
+            # Russian
+            (u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'),
+            (u'http://кто.рф/index.php?domain=Что', 'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'),
+
+            # Korean
+            (u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'),
+            (u'http://맨체스터시티축구단.한국/', 'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'),
+
+            # Arabic
+            (u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'),
+
+            # Chinese
+            (u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'),
+            (u'https://www2.xn--0kwr83e.在线', 'https://www2.xn--0kwr83e.xn--3ds443g'),
+            (u'https://www3.贷款.xn--3ds443g', 'https://www3.xn--0kwr83e.xn--3ds443g'),
+        )
+        for idn_input, safe_result in websites:
+            safeurl = safe_url_string(idn_input)
+            self.assertEqual(safeurl, safe_result)
+
+        # make sure the safe URL is unchanged when made safe a 2nd time
+        for _, safe_result in websites:
+            safeurl = safe_url_string(safe_result)
+            self.assertEqual(safeurl, safe_result)
+
     def test_safe_download_url(self):
         self.assertEqual(safe_download_url('http://www.example.org/../'),
                          'http://www.example.org/')
diff --git a/w3lib/url.py b/w3lib/url.py
index ac5de7e3..3bac3d35 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -53,7 +53,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
     # quote() in Python3 always returns Unicode (native str)
     return urlunsplit((
         to_native_str(parts.scheme),
-        to_native_str(parts.netloc),
+        to_native_str(parts.netloc.encode('idna')),
 
         # default encoding for path component SHOULD be UTF-8
         quote(to_bytes(parts.path, path_encoding), _safe_chars),