From 8a101321f6aaa24dd796241221f81a92bf116e9f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sat, 27 May 2017 22:02:08 +0100 Subject: [PATCH] Another attempt to fix encoding issues (#4486) When reading bytes from subprocesses, use the locale encoding. Don't fail if the data is encoded incorrectly - instead, use the backslashreplace error handler (and warn the user). --- docs/reference/pip.rst | 19 ++++++++++ news/4486.bugfix | 1 + pip/compat.py | 77 ++++++++++++++++++++++++++++++++++----- tests/unit/test_compat.py | 28 +++++++++++++- 4 files changed, 114 insertions(+), 11 deletions(-) create mode 100644 news/4486.bugfix diff --git a/docs/reference/pip.rst b/docs/reference/pip.rst index d14e6db095c..84e95dab26a 100644 --- a/docs/reference/pip.rst +++ b/docs/reference/pip.rst @@ -90,6 +90,25 @@ before invoking ``setup.py``. The injection should be transparent to ``setup.py`` emulating the commands pip requires may need to be aware that it takes place. +Build System Output +~~~~~~~~~~~~~~~~~~~ + +Any output produced by the build system will be read by pip (for display to the +user if requested). In order to correctly read the build system output, pip +requires that the output is written in a well-defined encoding, specifically +the encoding the user has configured for text output (which can be obtained in +Python using ``locale.getpreferredencoding``). If the configured encoding is +ASCII, pip assumes UTF-8 (to account for the behaviour of some Unix systems). + +Build systems should ensure that any tools they invoke (compilers, etc) produce +output in the correct encoding. In practice - and in particular on Windows, +where tools are inconsistent in their use of the "OEM" and "ANSI" codepages - +this may not always be possible. Pip will therefore attempt to recover cleanly +if presented with incorrectly encoded build tool output, by translating +unexpected byte sequences to Python-style hexadecimal escape sequences +(``"\x80\xff"``, etc). 
However, it is still possible for output to be displayed +using an incorrect encoding (mojibake). + Future Developments ~~~~~~~~~~~~~~~~~~~ diff --git a/news/4486.bugfix b/news/4486.bugfix new file mode 100644 index 00000000000..9f92ddd5535 --- /dev/null +++ b/news/4486.bugfix @@ -0,0 +1 @@ +Improve handling of text output from build tools (avoid Unicode errors) diff --git a/pip/compat.py b/pip/compat.py index 3682fd451a4..71bf8e6225a 100644 --- a/pip/compat.py +++ b/pip/compat.py @@ -4,6 +4,9 @@ import os import sys +import codecs +import locale +import logging from pip._vendor.six import text_type @@ -24,6 +27,8 @@ ] +logger = logging.getLogger(__name__) + if sys.version_info >= (3, 4): uses_pycache = True from importlib.util import cache_from_source @@ -36,22 +41,76 @@ cache_from_source = None -if sys.version_info >= (3,): - def console_to_str(s): - try: - return s.decode(sys.__stdout__.encoding) - except UnicodeDecodeError: - return s.decode('utf_8') +if sys.version_info >= (3, 5): + backslashreplace_decode = "backslashreplace" +else: + # In version 3.4 and older, backslashreplace exists + # but does not support use for decoding. + # We implement our own replace handler for this + # situation, so that we can consistently use + # backslash replacement for all versions. + def backslashreplace_decode_fn(err): + raw_bytes = (err.object[i] for i in range(err.start, err.end)) + if sys.version_info[0] == 2: + # Python 2 gave us characters - convert to numeric bytes + raw_bytes = (ord(b) for b in raw_bytes) + return u"".join(u"\\x%x" % c for c in raw_bytes), err.end + codecs.register_error( + "backslashreplace_decode", + backslashreplace_decode_fn) + backslashreplace_decode = "backslashreplace_decode" + + +def console_to_str(data): + """Return a string, safe for output, of subprocess output. + + We assume the data is in the locale preferred encoding. + If it won't decode properly, we warn the user but decode as + best we can. 
+ + We also ensure that the output can be safely written to + standard output without encoding errors. + """ + + # First, get the encoding we assume. This is the preferred + # encoding for the locale, unless that is not found, or + # it is ASCII, in which case assume UTF-8 + encoding = locale.getpreferredencoding() + if (not encoding) or codecs.lookup(encoding).name == "ascii": + encoding = "utf-8" + # Now try to decode the data - if we fail, warn the user and + # decode with replacement. + try: + s = data.decode(encoding) + except UnicodeDecodeError: + logger.warning( + "Subprocess output does not appear to be encoded as %s" % + encoding) + s = data.decode(encoding, errors=backslashreplace_decode) + + # Make sure we can print the output, by encoding it to the output + # encoding with replacement of unencodable characters, and then + # decoding again. + # We use stderr's encoding because it's less likely to be + # redirected. If we don't find an encoding, we skip this step + # (on the assumption that output is wrapped by something that won't fail). + # NOTE(review): sys.__stderr__ can be None (e.g. under pythonw) - confirm. 
+ output_encoding = sys.__stderr__.encoding + if output_encoding: + s = s.encode(output_encoding, errors="backslashreplace") + s = s.decode(output_encoding) + + return s + + +if sys.version_info >= (3,): def native_str(s, replace=False): if isinstance(s, bytes): return s.decode('utf-8', 'replace' if replace else 'strict') return s else: - def console_to_str(s): - return s - def native_str(s, replace=False): # Replace is ignored -- unicode to UTF-8 can't fail if isinstance(s, text_type): diff --git a/tests/unit/test_compat.py b/tests/unit/test_compat.py index 1d092c33110..2e896294f3e 100644 --- a/tests/unit/test_compat.py +++ b/tests/unit/test_compat.py @@ -1,7 +1,8 @@ +import locale import os - +import pip.compat import pytest -from pip.compat import expanduser, get_path_uid, native_str +from pip.compat import expanduser, get_path_uid, native_str, console_to_str def test_get_path_uid(): @@ -40,6 +41,29 @@ def test_get_path_uid_symlink_without_NOFOLLOW(tmpdir, monkeypatch): get_path_uid(fs) +def test_console_to_str(monkeypatch): + some_bytes = b"a\xE9\xC3\xE9b" + encodings = ('ascii', 'utf-8', 'iso-8859-1', 'iso-8859-5', + 'koi8_r', 'cp850') + for e in encodings: + monkeypatch.setattr(locale, 'getpreferredencoding', lambda: e) + result = console_to_str(some_bytes) + assert result.startswith("a") + assert result.endswith("b") + + +def test_console_to_str_warning(monkeypatch): + some_bytes = b"a\xE9b" + + def check_warning(msg): + assert msg.startswith( + "Subprocess output does not appear to be encoded as") + + monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8') + monkeypatch.setattr(pip.compat.logger, 'warning', check_warning) + console_to_str(some_bytes) + + def test_to_native_str_type(): some_bytes = b"test\xE9 et approuv\xC3\xE9" some_unicode = b"test\xE9 et approuv\xE9".decode('iso-8859-15')