From 8a101321f6aaa24dd796241221f81a92bf116e9f Mon Sep 17 00:00:00 2001
From: Paul Moore <p.f.moore@gmail.com>
Date: Sat, 27 May 2017 22:02:08 +0100
Subject: [PATCH] Another attempt to fix encoding issues (#4486)

When reading bytes from subprocesses, use the locale encoding. Don't fail if the data is encoded incorrectly - instead, use the backslashreplace error handler (and warn the user).
---
 docs/reference/pip.rst    | 19 ++++++++++
 news/4486.bugfix          |  1 +
 pip/compat.py             | 77 ++++++++++++++++++++++++++++++++++-----
 tests/unit/test_compat.py | 28 +++++++++++++-
 4 files changed, 114 insertions(+), 11 deletions(-)
 create mode 100644 news/4486.bugfix

diff --git a/docs/reference/pip.rst b/docs/reference/pip.rst
index d14e6db095c..84e95dab26a 100644
--- a/docs/reference/pip.rst
+++ b/docs/reference/pip.rst
@@ -90,6 +90,25 @@ before invoking ``setup.py``. The injection should be transparent to
 ``setup.py`` emulating the commands pip requires may need to be aware that it
 takes place.
 
+Build System Output
+~~~~~~~~~~~~~~~~~~~
+
+Any output produced by the build system will be read by pip (for display to the
+user if requested). In order to correctly read the build system output, pip
+requires that the output is written in a well-defined encoding, specifically
+the encoding the user has configured for text output (which can be obtained in
+Python using ``locale.getpreferredencoding``). If the configured encoding is
+ASCII, pip assumes UTF-8 (to account for the behaviour of some Unix systems).
+
+Build systems should ensure that any tools they invoke (compilers, etc) produce
+output in the correct encoding. In practice - and in particular on Windows,
+where tools are inconsistent in their use of the "OEM" and "ANSI" codepages -
+this may not always be possible. Pip will therefore attempt to recover cleanly
+if presented with incorrectly encoded build tool output, by warning the user
+and translating unexpected byte sequences to Python-style hexadecimal escape
+sequences (``"\x80\xff"``, etc). However, it is still possible for output to
+be displayed using an incorrect encoding (mojibake).
+
 Future Developments
 ~~~~~~~~~~~~~~~~~~~
 
diff --git a/news/4486.bugfix b/news/4486.bugfix
new file mode 100644
index 00000000000..9f92ddd5535
--- /dev/null
+++ b/news/4486.bugfix
@@ -0,0 +1 @@
+Improve handling of text output from build tools (avoid Unicode errors)
diff --git a/pip/compat.py b/pip/compat.py
index 3682fd451a4..71bf8e6225a 100644
--- a/pip/compat.py
+++ b/pip/compat.py
@@ -4,6 +4,9 @@
 
 import os
 import sys
+import codecs
+import locale
+import logging
 
 from pip._vendor.six import text_type
 
@@ -24,6 +27,8 @@
 ]
 
 
+logger = logging.getLogger(__name__)
+
 if sys.version_info >= (3, 4):
     uses_pycache = True
     from importlib.util import cache_from_source
@@ -36,22 +41,76 @@
         cache_from_source = None
 
 
-if sys.version_info >= (3,):
-    def console_to_str(s):
-        try:
-            return s.decode(sys.__stdout__.encoding)
-        except UnicodeDecodeError:
-            return s.decode('utf_8')
+if sys.version_info >= (3, 5):
+    backslashreplace_decode = "backslashreplace"
+else:
+    # In version 3.4 and older, backslashreplace exists but does not
+    # support use for decoding. We register our own error handler for
+    # this situation, so that we can consistently use backslash
+    # replacement for all versions.
+    def backslashreplace_decode_fn(err):
+        """Replace the undecodable bytes with two-digit hex escapes."""
+        raw_bytes = err.object[err.start:err.end]
+        if sys.version_info[0] == 2:
+            # Python 2 gave us characters - convert to numeric bytes
+            raw_bytes = (ord(b) for b in raw_bytes)
+        return u"".join(u"\\x%02x" % c for c in raw_bytes), err.end
+    codecs.register_error(
+        "backslashreplace_decode",
+        backslashreplace_decode_fn)
+    backslashreplace_decode = "backslashreplace_decode"
+
+
+def console_to_str(data):
+    """Return a string, safe for output, of subprocess output.
+
+    `data` is bytes, assumed to be in the locale preferred encoding
+    (UTF-8 if that is ASCII or unset). If it won't decode properly,
+    we warn the user and backslash-escape the offending bytes.
+
+    The result is also made safe to write to the console by
+    escaping characters that stderr's encoding cannot represent.
+    """
+
+    # First, get the encoding we assume. This is the preferred
+    # encoding for the locale, unless that is not found, or
+    # it is ASCII, in which case assume UTF-8
+    encoding = locale.getpreferredencoding()
+    if (not encoding) or codecs.lookup(encoding).name == "ascii":
+        encoding = "utf-8"
 
+    # Now try to decode the data - if we fail, warn the user and
+    # decode with replacement.
+    try:
+        s = data.decode(encoding)
+    except UnicodeDecodeError:
+        logger.warning(
+            "Subprocess output does not appear to be encoded as %s" %
+            encoding)
+        s = data.decode(encoding, errors=backslashreplace_decode)
+
+    # Make sure we can print the output, by encoding it to the output
+    # encoding with replacement of unencodable characters, and then
+    # decoding again.
+    # We use stderr's encoding because it's less likely to be
+    # redirected. If there is no stderr (it can be None, e.g. under
+    # pythonw) or it has no encoding, we skip this step (on the
+    # assumption that output is wrapped by something that won't fail).
+    output_encoding = getattr(sys.__stderr__, "encoding", None)
+    if output_encoding:
+        s = s.encode(output_encoding, errors="backslashreplace")
+        s = s.decode(output_encoding)
+
+    return s
+
+
+if sys.version_info >= (3,):
     def native_str(s, replace=False):
         if isinstance(s, bytes):
             return s.decode('utf-8', 'replace' if replace else 'strict')
         return s
 
 else:
-    def console_to_str(s):
-        return s
-
     def native_str(s, replace=False):
         # Replace is ignored -- unicode to UTF-8 can't fail
         if isinstance(s, text_type):
diff --git a/tests/unit/test_compat.py b/tests/unit/test_compat.py
index 1d092c33110..2e896294f3e 100644
--- a/tests/unit/test_compat.py
+++ b/tests/unit/test_compat.py
@@ -1,7 +1,8 @@
+import locale
 import os
-
+import pip.compat
 import pytest
-from pip.compat import expanduser, get_path_uid, native_str
+from pip.compat import expanduser, get_path_uid, native_str, console_to_str
 
 
 def test_get_path_uid():
@@ -40,6 +41,29 @@ def test_get_path_uid_symlink_without_NOFOLLOW(tmpdir, monkeypatch):
         get_path_uid(fs)
 
 
+def test_console_to_str(monkeypatch):
+    raw = b"a\xE9\xC3\xE9b"
+    for name in ('ascii', 'utf-8', 'iso-8859-1', 'iso-8859-5',
+                 'koi8_r', 'cp850'):
+        # Whatever the locale encoding claims to be, decoding must not
+        # raise, and the ASCII bytes at either end must survive intact.
+        monkeypatch.setattr(locale, 'getpreferredencoding', lambda: name)
+        text = console_to_str(raw)
+        assert text.startswith("a") and text.endswith("b")
+
+
+def test_console_to_str_warning(monkeypatch):
+    some_bytes = b"a\xE9b"
+    # Record the warnings so that a missing warning fails the test.
+    messages = []
+    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
+    monkeypatch.setattr(pip.compat.logger, 'warning', messages.append)
+    console_to_str(some_bytes)
+    assert len(messages) == 1
+    assert messages[0].startswith(
+        "Subprocess output does not appear to be encoded as")
+
+
 def test_to_native_str_type():
     some_bytes = b"test\xE9 et approuv\xC3\xE9"
     some_unicode = b"test\xE9 et approuv\xE9".decode('iso-8859-15')