diff --git a/tests/test_util.py b/tests/test_util.py
new file mode 100644
index 00000000..575c7671
--- /dev/null
+++ b/tests/test_util.py
@@ -0,0 +1,46 @@
+from unittest import TestCase
+
+from pytest import deprecated_call, raises
+
+from w3lib.util import (
+    str_to_unicode,
+    to_bytes,
+    to_native_str,
+    to_unicode,
+    unicode_to_str,
+)
+
+
+class StrToUnicodeTestCase(TestCase):
+
+    def test_deprecation(self):
+        with deprecated_call():
+            str_to_unicode('')
+
+
+class ToBytesTestCase(TestCase):
+
+    def test_type_error(self):
+        with raises(TypeError):
+            to_bytes(True)
+
+
+class ToNativeStrTestCase(TestCase):
+
+    def test_deprecation(self):
+        with deprecated_call():
+            to_native_str('')
+
+
+class ToUnicodeTestCase(TestCase):
+
+    def test_type_error(self):
+        with raises(TypeError):
+            to_unicode(True)
+
+
+class UnicodeToStrTestCase(TestCase):
+
+    def test_deprecation(self):
+        with deprecated_call():
+            unicode_to_str('')
diff --git a/w3lib/url.py b/w3lib/url.py
index 02841378..f0ac932d 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -24,12 +24,12 @@
     urlunsplit,
 )
 from urllib.request import pathname2url, url2pathname
-from w3lib.util import to_bytes, to_native_str, to_unicode
+from w3lib.util import to_unicode
 
 
 # error handling function for bytes-to-Unicode decoding errors with URLs
 def _quote_byte(error):
-    return (to_unicode(quote(error.object[error.start:error.end])), error.end)
+    return (quote(error.object[error.start:error.end]), error.end)
 
 codecs.register_error('percentencode', _quote_byte)
 
@@ -77,26 +77,22 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True)
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
     try:
-        netloc = parts.netloc.encode('idna')
+        netloc = parts.netloc.encode('idna').decode()
     except UnicodeError:
         netloc = parts.netloc
 
     # default encoding for path component SHOULD be UTF-8
     if quote_path:
-        path = quote(to_bytes(parts.path, path_encoding), _path_safe_chars)
+        path = quote(parts.path.encode(path_encoding), _path_safe_chars)
     else:
-        path = to_native_str(parts.path)
+        path = parts.path
 
-    # quote() in Python2 return type follows input type;
-    # quote() in Python3 always returns Unicode (native str)
     return urlunsplit((
-        to_native_str(parts.scheme),
-        to_native_str(netloc).rstrip(':'),
+        parts.scheme,
+        netloc.rstrip(':'),
         path,
-        # encoding of query and fragment follows page encoding
-        # or form-charset (if known and passed)
-        quote(to_bytes(parts.query, encoding), _safe_chars),
-        quote(to_bytes(parts.fragment, encoding), _safe_chars),
+        quote(parts.query.encode(encoding), _safe_chars),
+        quote(parts.fragment.encode(encoding), _safe_chars),
     ))
 
 
@@ -410,22 +406,17 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
     try:
-        netloc = parts.netloc.encode('idna')
+        netloc = parts.netloc.encode('idna').decode()
     except UnicodeError:
         netloc = parts.netloc
 
     return (
-        to_native_str(parts.scheme),
-        to_native_str(netloc),
-
-        # default encoding for path component SHOULD be UTF-8
-        quote(to_bytes(parts.path, path_encoding), _path_safe_chars),
-        quote(to_bytes(parts.params, path_encoding), _safe_chars),
-
-        # encoding of query and fragment follows page encoding
-        # or form-charset (if known and passed)
-        quote(to_bytes(parts.query, encoding), _safe_chars),
-        quote(to_bytes(parts.fragment, encoding), _safe_chars)
+        parts.scheme,
+        netloc,
+        quote(parts.path.encode(path_encoding), _path_safe_chars),
+        quote(parts.params.encode(path_encoding), _safe_chars),
+        quote(parts.query.encode(encoding), _safe_chars),
+        quote(parts.fragment.encode(encoding), _safe_chars)
     )
 
 
@@ -466,7 +457,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
     # if not for proper URL expected by remote website.
     try:
         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
-            parse_url(url), encoding=encoding)
+            parse_url(url), encoding=encoding or 'utf8')
     except UnicodeEncodeError as e:
         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
             parse_url(url), encoding='utf8')
diff --git a/w3lib/util.py b/w3lib/util.py
index 02deeeea..8beaac9c 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -1,4 +1,13 @@
+from warnings import warn
+
+
 def str_to_unicode(text, encoding=None, errors='strict'):
+    warn(
+        "The w3lib.util.str_to_unicode function is deprecated and "
+        "will be removed in a future release.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     if encoding is None:
         encoding = 'utf-8'
     if isinstance(text, bytes):
@@ -6,6 +15,12 @@ def str_to_unicode(text, encoding=None, errors='strict'):
     return text
 
 def unicode_to_str(text, encoding=None, errors='strict'):
+    warn(
+        "The w3lib.util.unicode_to_str function is deprecated and "
+        "will be removed in a future release.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     if encoding is None:
         encoding = 'utf-8'
     if isinstance(text, str):
@@ -18,8 +33,9 @@ def to_unicode(text, encoding=None, errors='strict'):
     if isinstance(text, str):
         return text
     if not isinstance(text, (bytes, str)):
-        raise TypeError('to_unicode must receive a bytes, str or unicode '
-                        'object, got %s' % type(text).__name__)
+        raise TypeError(
+            f'to_unicode must receive bytes or str, got {type(text).__name__}'
+        )
     if encoding is None:
         encoding = 'utf-8'
     return text.decode(encoding, errors)
@@ -30,12 +46,20 @@ def to_bytes(text, encoding=None, errors='strict'):
     if isinstance(text, bytes):
         return text
     if not isinstance(text, str):
-        raise TypeError('to_bytes must receive a unicode, str or bytes '
-                        'object, got %s' % type(text).__name__)
+        raise TypeError(
+            f'to_bytes must receive str or bytes, got {type(text).__name__}'
+        )
     if encoding is None:
         encoding = 'utf-8'
     return text.encode(encoding, errors)
 
 def to_native_str(text, encoding=None, errors='strict'):
     """ Return str representation of `text` """
+    warn(
+        "The w3lib.util.to_native_str function is deprecated and "
+        "will be removed in a future release. Please use "
+        "w3lib.util.to_unicode instead.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     return to_unicode(text, encoding, errors)