From 8a101321f6aaa24dd796241221f81a92bf116e9f Mon Sep 17 00:00:00 2001
From: Paul Moore
Date: Sat, 27 May 2017 22:02:08 +0100
Subject: [PATCH] Another attempt to fix encoding issues (#4486)
When reading bytes from subprocesses, use the locale encoding. Don't fail if the data is encoded incorrectly - instead, use the backslashreplace error handler (and warn the user).
---
docs/reference/pip.rst | 19 ++++++++++
news/4486.bugfix | 1 +
pip/compat.py | 77 ++++++++++++++++++++++++++++++++++-----
tests/unit/test_compat.py | 28 +++++++++++++-
4 files changed, 114 insertions(+), 11 deletions(-)
create mode 100644 news/4486.bugfix
diff --git a/docs/reference/pip.rst b/docs/reference/pip.rst
index d14e6db095c..84e95dab26a 100644
--- a/docs/reference/pip.rst
+++ b/docs/reference/pip.rst
@@ -90,6 +90,25 @@ before invoking ``setup.py``. The injection should be transparent to
``setup.py`` emulating the commands pip requires may need to be aware that it
takes place.
+Build System Output
+~~~~~~~~~~~~~~~~~~~
+
+Any output produced by the build system will be read by pip (for display to the
+user if requested). In order to correctly read the build system output, pip
+requires that the output is written in a well-defined encoding, specifically
+the encoding the user has configured for text output (which can be obtained in
+Python using ``locale.getpreferredencoding``). If the configured encoding is
+ASCII, pip assumes UTF-8 (to account for the behaviour of some Unix systems).
+
+Build systems should ensure that any tools they invoke (compilers, etc) produce
+output in the correct encoding. In practice - and in particular on Windows,
+where tools are inconsistent in their use of the "OEM" and "ANSI" codepages -
+this may not always be possible. Pip will therefore attempt to recover cleanly
+if presented with incorrectly encoded build tool output, by translating
+unexpected byte sequences to Python-style hexadecimal escape sequences
+(``"\x80\xff"``, etc.). However, bytes that happen to be valid in the assumed
+encoding are decoded without error, so output may still be displayed using an
+incorrect encoding (mojibake).
+
Future Developments
~~~~~~~~~~~~~~~~~~~
diff --git a/news/4486.bugfix b/news/4486.bugfix
new file mode 100644
index 00000000000..9f92ddd5535
--- /dev/null
+++ b/news/4486.bugfix
@@ -0,0 +1 @@
+Improve handling of text output from build tools (avoid Unicode errors)
diff --git a/pip/compat.py b/pip/compat.py
index 3682fd451a4..71bf8e6225a 100644
--- a/pip/compat.py
+++ b/pip/compat.py
@@ -4,6 +4,9 @@
import os
import sys
+import codecs
+import locale
+import logging
from pip._vendor.six import text_type
@@ -24,6 +27,8 @@
]
+logger = logging.getLogger(__name__)
+
if sys.version_info >= (3, 4):
uses_pycache = True
from importlib.util import cache_from_source
@@ -36,22 +41,76 @@
cache_from_source = None
-if sys.version_info >= (3,):
- def console_to_str(s):
- try:
- return s.decode(sys.__stdout__.encoding)
- except UnicodeDecodeError:
- return s.decode('utf_8')
+if sys.version_info >= (3, 5):
+    backslashreplace_decode = "backslashreplace"
+else:
+    # In version 3.4 and older, backslashreplace exists
+    # but does not support use for decoding.
+    # We implement our own replace handler for this
+    # situation, so that we can consistently use
+    # backslash replacement for all versions.
+    def backslashreplace_decode_fn(err):
+        """Codec error handler: replace undecodable bytes with \\xNN escapes.
+
+        :param err: the UnicodeDecodeError describing the failing range.
+        :return: a ``(replacement_text, resume_position)`` tuple, as
+            required of handlers passed to ``codecs.register_error``.
+        """
+        # bytearray yields integers on both Python 2 and Python 3, so no
+        # version-specific ord() conversion is needed.
+        raw_bytes = bytearray(err.object[err.start:err.end])
+        # Always emit two hex digits so the result matches the built-in
+        # handler used on 3.5+ (e.g. "\x0a", not "\xa").
+        return u"".join(u"\\x%02x" % c for c in raw_bytes), err.end
+    codecs.register_error(
+        "backslashreplace_decode",
+        backslashreplace_decode_fn)
+    backslashreplace_decode = "backslashreplace_decode"
+
+
+def console_to_str(data):
+    """Return a string, safe for output, of subprocess output.
+
+    We assume the data is in the locale preferred encoding.
+    If it won't decode properly, we warn the user but decode as
+    best we can.
+
+    We also ensure that the output can be safely written to
+    standard output without encoding errors.
+
+    :param data: raw bytes read from a subprocess.
+    :return: the decoded text; bytes that could not be decoded, and
+        characters the output encoding cannot represent, appear as
+        backslash escapes.
+    """
+
+    # First, get the encoding we assume. This is the preferred
+    # encoding for the locale, unless that is not found, or
+    # it is ASCII, in which case assume UTF-8
+    encoding = locale.getpreferredencoding()
+    if (not encoding) or codecs.lookup(encoding).name == "ascii":
+        encoding = "utf-8"
+
+    # Now try to decode the data - if we fail, warn the user and
+    # decode with replacement.
+    try:
+        s = data.decode(encoding)
+    except UnicodeDecodeError:
+        logger.warning(
+            "Subprocess output does not appear to be encoded as %s" %
+            encoding)
+        s = data.decode(encoding, errors=backslashreplace_decode)
+
+    # Make sure we can print the output, by encoding it to the output
+    # encoding with replacement of unencodable characters, and then
+    # decoding again.
+    # We use stderr's encoding because it's less likely to be
+    # redirected and if we don't find an encoding we skip this
+    # step (on the assumption that output is wrapped by something
+    # that won't fail).
+    # sys.__stderr__ may itself be None (e.g. no console attached), so
+    # look the attribute up defensively instead of dereferencing it.
+    output_encoding = getattr(sys.__stderr__, "encoding", None)
+    if output_encoding:
+        s = s.encode(output_encoding, errors="backslashreplace")
+        s = s.decode(output_encoding)
+
+    return s
+
+
+if sys.version_info >= (3,):
def native_str(s, replace=False):
if isinstance(s, bytes):
return s.decode('utf-8', 'replace' if replace else 'strict')
return s
else:
- def console_to_str(s):
- return s
-
def native_str(s, replace=False):
# Replace is ignored -- unicode to UTF-8 can't fail
if isinstance(s, text_type):
diff --git a/tests/unit/test_compat.py b/tests/unit/test_compat.py
index 1d092c33110..2e896294f3e 100644
--- a/tests/unit/test_compat.py
+++ b/tests/unit/test_compat.py
@@ -1,7 +1,8 @@
+import locale
import os
-
+import pip.compat
import pytest
-from pip.compat import expanduser, get_path_uid, native_str
+from pip.compat import expanduser, get_path_uid, native_str, console_to_str
def test_get_path_uid():
@@ -40,6 +41,29 @@ def test_get_path_uid_symlink_without_NOFOLLOW(tmpdir, monkeypatch):
get_path_uid(fs)
+def test_console_to_str(monkeypatch):
+    """The same bytes must be converted under a range of locale encodings."""
+    payload = b"a\xE9\xC3\xE9b"
+    for codec_name in ('ascii', 'utf-8', 'iso-8859-1', 'iso-8859-5',
+                       'koi8_r', 'cp850'):
+        monkeypatch.setattr(
+            locale, 'getpreferredencoding', lambda: codec_name)
+        text = console_to_str(payload)
+        # Only the ASCII first and last characters are stable across
+        # encodings; the middle bytes decode differently in each.
+        assert text.startswith("a")
+        assert text.endswith("b")
+
+
+def test_console_to_str_warning(monkeypatch):
+    """Bytes that are invalid in the locale encoding must log one warning."""
+    # Record every warning so the test fails if none is emitted at all,
+    # rather than only checking the text when a warning happens to occur.
+    messages = []
+
+    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
+    monkeypatch.setattr(pip.compat.logger, 'warning', messages.append)
+    console_to_str(b"a\xE9b")
+
+    assert len(messages) == 1
+    assert messages[0].startswith(
+        "Subprocess output does not appear to be encoded as")
+
+
def test_to_native_str_type():
some_bytes = b"test\xE9 et approuv\xC3\xE9"
some_unicode = b"test\xE9 et approuv\xE9".decode('iso-8859-15')