Skip to content

Commit

Permalink
Another attempt to fix encoding issues (#4486)
Browse files Browse the repository at this point in the history
When reading bytes from subprocesses, use the locale encoding. Don't fail if the data is encoded incorrectly - instead, use the backslashreplace error handler (and warn the user).
  • Loading branch information
pfmoore committed May 27, 2017
1 parent a3ce000 commit 8a10132
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 11 deletions.
19 changes: 19 additions & 0 deletions docs/reference/pip.rst
Expand Up @@ -90,6 +90,25 @@ before invoking ``setup.py``. The injection should be transparent to
``setup.py`` emulating the commands pip requires may need to be aware that it
takes place.

Build System Output
~~~~~~~~~~~~~~~~~~~

Any output produced by the build system will be read by pip (for display to the
user if requested). In order to correctly read the build system output, pip
requires that the output is written in a well-defined encoding, specifically
the encoding the user has configured for text output (which can be obtained in
Python using ``locale.getpreferredencoding``). If the configured encoding is
ASCII, pip assumes UTF-8 (to account for the behaviour of some Unix systems).

Build systems should ensure that any tools they invoke (compilers, etc.) produce
output in the correct encoding. In practice — and in particular on Windows,
where tools are inconsistent in their use of the "OEM" and "ANSI" codepages —
this may not always be possible. Pip will therefore attempt to recover cleanly
if presented with incorrectly encoded build tool output, by translating
unexpected byte sequences to Python-style hexadecimal escape sequences
(``"\x80\xff"``, etc.). However, it is still possible for output to be displayed
using an incorrect encoding (mojibake).

Future Developments
~~~~~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions news/4486.bugfix
@@ -0,0 +1 @@
Improve handling of text output from build tools (avoid Unicode errors)
77 changes: 68 additions & 9 deletions pip/compat.py
Expand Up @@ -4,6 +4,9 @@

import os
import sys
import codecs
import locale
import logging

from pip._vendor.six import text_type

Expand All @@ -24,6 +27,8 @@
]


logger = logging.getLogger(__name__)

if sys.version_info >= (3, 4):
uses_pycache = True
from importlib.util import cache_from_source
Expand All @@ -36,22 +41,76 @@
cache_from_source = None


if sys.version_info >= (3,):
def console_to_str(s):
try:
return s.decode(sys.__stdout__.encoding)
except UnicodeDecodeError:
return s.decode('utf_8')
def backslashreplace_decode_fn(err):
    """Codec error handler: replace undecodable bytes with ``\\xNN`` escapes.

    ``err`` is the ``UnicodeDecodeError`` raised by the codec. Returns the
    ``(replacement, resume_position)`` tuple that the codecs machinery
    expects from an error handler.
    """
    # bytearray yields integers on both Python 2 (where indexing a str
    # gives characters) and Python 3, so no version check is needed.
    raw_bytes = bytearray(err.object[err.start:err.end])
    # Always emit two hex digits so the result is a valid Python-style
    # escape (``\x05``, not ``\x5``), matching the built-in handler.
    return u"".join(u"\\x%02x" % c for c in raw_bytes), err.end


if sys.version_info >= (3, 5):
    backslashreplace_decode = "backslashreplace"
else:
    # In version 3.4 and older, backslashreplace exists
    # but does not support use for decoding.
    # We register our own replace handler for this
    # situation, so that we can consistently use
    # backslash replacement for all versions.
    codecs.register_error(
        "backslashreplace_decode",
        backslashreplace_decode_fn)
    backslashreplace_decode = "backslashreplace_decode"


def console_to_str(data):
    """Return a string, safe for output, of subprocess output.

    ``data`` is the raw bytes read from a subprocess. It is assumed to be
    in the locale's preferred encoding; if it does not decode cleanly a
    warning is logged and undecodable bytes are replaced with backslash
    escapes. The result is then round-tripped through the original
    stderr's encoding (when one is available) so that it can be written
    out without raising an encoding error.
    """

    # First, get the encoding we assume. This is the preferred
    # encoding for the locale, unless that is not found, or
    # it is ASCII, in which case assume UTF-8
    encoding = locale.getpreferredencoding()
    if (not encoding) or codecs.lookup(encoding).name == "ascii":
        encoding = "utf-8"

    # Now try to decode the data - if we fail, warn the user and
    # decode with replacement.
    try:
        s = data.decode(encoding)
    except UnicodeDecodeError:
        logger.warning(
            "Subprocess output does not appear to be encoded as %s" %
            encoding)
        s = data.decode(encoding, errors=backslashreplace_decode)

    # Make sure we can print the output, by encoding it to the output
    # encoding with replacement of unencodable characters, and then
    # decoding again.
    # We use stderr's encoding because it's less likely to be
    # redirected and if we don't find an encoding we skip this
    # step (on the assumption that output is wrapped by something
    # that won't fail).
    # sys.__stderr__ itself may be None (e.g. under pythonw or in
    # Windows GUI applications), so look the attribute up defensively.
    output_encoding = getattr(
        getattr(sys, "__stderr__", None), "encoding", None)
    if output_encoding:
        s = s.encode(output_encoding, errors="backslashreplace")
        s = s.decode(output_encoding)

    return s


if sys.version_info >= (3,):
def native_str(s, replace=False):
if isinstance(s, bytes):
return s.decode('utf-8', 'replace' if replace else 'strict')
return s

else:
def console_to_str(s):
return s

def native_str(s, replace=False):
# Replace is ignored -- unicode to UTF-8 can't fail
if isinstance(s, text_type):
Expand Down
28 changes: 26 additions & 2 deletions tests/unit/test_compat.py
@@ -1,7 +1,8 @@
import locale
import os

import pip.compat
import pytest
from pip.compat import expanduser, get_path_uid, native_str
from pip.compat import expanduser, get_path_uid, native_str, console_to_str


def test_get_path_uid():
Expand Down Expand Up @@ -40,6 +41,29 @@ def test_get_path_uid_symlink_without_NOFOLLOW(tmpdir, monkeypatch):
get_path_uid(fs)


def test_console_to_str(monkeypatch):
    """console_to_str must not fail, whatever the locale encoding is.

    The payload mixes ASCII with bytes that are invalid or ambiguous in
    several encodings; only the ASCII first and last characters are
    checked, since the middle depends on the encoding in use.
    """
    payload = b"a\xE9\xC3\xE9b"
    candidate_encodings = (
        'ascii', 'utf-8', 'iso-8859-1', 'iso-8859-5', 'koi8_r', 'cp850',
    )
    for name in candidate_encodings:
        # The lambda is called within this iteration, so closing over
        # the loop variable is safe here.
        monkeypatch.setattr(locale, 'getpreferredencoding', lambda: name)
        decoded = console_to_str(payload)
        assert decoded.startswith("a")
        assert decoded.endswith("b")


def test_console_to_str_warning(monkeypatch):
    """Undecodable subprocess output must trigger a logged warning."""
    some_bytes = b"a\xE9b"
    seen = []

    # Accept extra positional/keyword arguments so the stub keeps working
    # if the logger is ever called with lazy %-style arguments.
    def check_warning(msg, *args, **kwargs):
        seen.append(msg)
        assert msg.startswith(
            "Subprocess output does not appear to be encoded as")

    monkeypatch.setattr(locale, 'getpreferredencoding', lambda: 'utf-8')
    monkeypatch.setattr(pip.compat.logger, 'warning', check_warning)
    console_to_str(some_bytes)

    # Without this check the test would pass vacuously when no warning
    # is emitted at all.
    assert seen, "expected a warning about incorrectly encoded output"


def test_to_native_str_type():
some_bytes = b"test\xE9 et approuv\xC3\xE9"
some_unicode = b"test\xE9 et approuv\xE9".decode('iso-8859-15')
Expand Down

1 comment on commit 8a10132

@Rashid-S
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm new to Python. I encountered the same problem "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcd in position 24: invalid continuation byte". Tell me, do I need to fix four files, as indicated here?

Please sign in to comment.