From cba083646baa7067640812ce8661370743275f89 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 10 Apr 2022 17:48:19 +0900 Subject: [PATCH 1/5] bpo-47000: Fix `encoding="locale"` in UTF-8 mode --- Lib/_pyio.py | 6 ++++-- Modules/_io/textio.c | 11 ++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index e3ff59eb1adb19..ad8805c4000dd1 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -2021,7 +2021,9 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None, self._check_newline(newline) encoding = text_encoding(encoding) - if encoding == "locale": + if encoding == "locale" and sys.platform == "win32": + # On Unix, os.device_encoding() returns "utf-8" instead of locale encoding + # in the UTF-8 mode. So we use os.device_encoding() only on Windows. try: encoding = os.device_encoding(buffer.fileno()) or "locale" except (AttributeError, UnsupportedOperation): @@ -2034,7 +2036,7 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None, # Importing locale may fail if Python is being built encoding = "utf-8" else: - encoding = locale.getpreferredencoding(False) + encoding = locale.getencoding() if not isinstance(encoding, str): raise ValueError("invalid encoding: %r" % encoding) diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 0e207413257f49..67ae3deded4968 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1061,6 +1061,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, _PyIO_State *state = NULL; PyObject *res; int r; + int use_locale_encoding = 0; // Use locale encoding even in UTF-8 mode. 
self->ok = 0; self->detached = 0; @@ -1076,6 +1077,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, } else if (strcmp(encoding, "locale") == 0) { encoding = NULL; + use_locale_encoding = 1; } if (errors == Py_None) { @@ -1113,6 +1115,11 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, self->encodefunc = NULL; self->b2cratio = 0.0; +#ifdef MS_WINDOWS + // os.device_encoding() on Unix is the locale encoding or UTF-8 + // according to UTF-8 Mode. + // Since UTF-8 mode shouldn't affect `encoding="locale"`, we call + // os.device_encoding() only on Windows. if (encoding == NULL) { /* Try os.device_encoding(fileno) */ PyObject *fileno; @@ -1144,8 +1151,10 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, Py_CLEAR(self->encoding); } } +#endif + if (encoding == NULL && self->encoding == NULL) { - if (_PyRuntime.preconfig.utf8_mode) { + if (_PyRuntime.preconfig.utf8_mode && !use_locale_encoding) { _Py_DECLARE_STR(utf_8, "utf-8"); self->encoding = Py_NewRef(&_Py_STR(utf_8)); } From 2e360d501c76a88ffa4aaa179f52279613dfede4 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 10 Apr 2022 17:50:20 +0900 Subject: [PATCH 2/5] Add NEWS --- .../next/Library/2022-04-10-17-50-18.bpo-47000.JlQkFx.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2022-04-10-17-50-18.bpo-47000.JlQkFx.rst diff --git a/Misc/NEWS.d/next/Library/2022-04-10-17-50-18.bpo-47000.JlQkFx.rst b/Misc/NEWS.d/next/Library/2022-04-10-17-50-18.bpo-47000.JlQkFx.rst new file mode 100644 index 00000000000000..77d5b8f33d9a0c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-10-17-50-18.bpo-47000.JlQkFx.rst @@ -0,0 +1,2 @@ +Make :class:`TextIOWrapper` use the locale encoding when ``encoding="locale"`` +is specified, even in UTF-8 mode. 
From da02718aa79c48f397efc481d3561451feb5556a Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Mon, 11 Apr 2022 12:08:38 +0900 Subject: [PATCH 3/5] Fix unused variable --- Modules/_io/textio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 67ae3deded4968..d2fd2265c41885 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1058,7 +1058,6 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, /*[clinic end generated code: output=72267c0c01032ed2 input=77d8696d1a1f460b]*/ { PyObject *raw, *codec_info = NULL; - _PyIO_State *state = NULL; PyObject *res; int r; int use_locale_encoding = 0; // Use locale encoding even in UTF-8 mode. @@ -1123,7 +1122,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, if (encoding == NULL) { /* Try os.device_encoding(fileno) */ PyObject *fileno; - state = IO_STATE(); + _PyIO_State *state = IO_STATE(); if (state == NULL) goto error; fileno = PyObject_CallMethodNoArgs(buffer, &_Py_ID(fileno)); From 713e9f44a9a6eec568e6804ffdfa947f4385df5c Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Mon, 11 Apr 2022 13:34:08 +0900 Subject: [PATCH 4/5] Skip test_device_encoding --- Lib/test/test_io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index 29fe287550b2d0..c86251dfe5734c 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -2737,6 +2737,7 @@ def test_default_encoding(self): os.environ.update(old_environ) @support.cpython_only + @unittest.skipIf(sys.platform != "win32", "Windows-only test") @unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled") def test_device_encoding(self): # Issue 15989 From 686490e75342d88134e1906c131a9e78bb860c72 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Tue, 12 Apr 2022 12:46:38 +0900 Subject: [PATCH 5/5] Update docs --- Doc/library/io.rst | 5 ++--- Doc/using/windows.rst | 2 +- Lib/_pyio.py | 2 +- Lib/locale.py | 2 +- Modules/_io/_iomodule.c | 
8 ++++---- Modules/_io/clinic/_iomodule.c.h | 8 ++++---- Modules/_io/clinic/textio.c.h | 4 ++-- Modules/_io/textio.c | 4 ++-- Tools/c-analyzer/TODO | 1 - 9 files changed, 17 insertions(+), 19 deletions(-) diff --git a/Doc/library/io.rst b/Doc/library/io.rst index 80107d539505c0..53dad99fa1dbc3 100644 --- a/Doc/library/io.rst +++ b/Doc/library/io.rst @@ -112,7 +112,7 @@ Text Encoding ------------- The default encoding of :class:`TextIOWrapper` and :func:`open` is -locale-specific (:func:`locale.getpreferredencoding(False) <locale.getpreferredencoding>`). +locale-specific (:func:`locale.getencoding`). However, many developers forget to specify the encoding when opening text files encoded in UTF-8 (e.g. JSON, TOML, Markdown, etc...) since most Unix @@ -948,8 +948,7 @@ Text I/O :class:`TextIOBase`. *encoding* gives the name of the encoding that the stream will be decoded or - encoded with. It defaults to - :func:`locale.getpreferredencoding(False) <locale.getpreferredencoding>`. + encoded with. It defaults to :func:`locale.getencoding()`. ``encoding="locale"`` can be used to specify the current locale's encoding explicitly. See :ref:`io-text-encoding` for more information. diff --git a/Doc/using/windows.rst b/Doc/using/windows.rst index 83eee281d4e5c8..88dcb002e2c249 100644 --- a/Doc/using/windows.rst +++ b/Doc/using/windows.rst @@ -618,7 +618,7 @@ UTF-8 mode Windows still uses legacy encodings for the system encoding (the ANSI Code Page). Python uses it for the default encoding of text files (e.g. -:func:`locale.getpreferredencoding`). +:func:`locale.getencoding`). This may cause issues because UTF-8 is widely used on the internet and most Unix systems, including WSL (Windows Subsystem for Linux). diff --git a/Lib/_pyio.py b/Lib/_pyio.py index ad8805c4000dd1..0f33ed59492e71 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1988,7 +1988,7 @@ class TextIOWrapper(TextIOBase): r"""Character and line based layer over a BufferedIOBase object, buffer. 
encoding gives the name of the encoding that the stream will be - decoded or encoded with. It defaults to locale.getpreferredencoding(False). + decoded or encoded with. It defaults to locale.getencoding(). errors determines the strictness of encoding and decoding (see the codecs.register) and defaults to "strict". diff --git a/Lib/locale.py b/Lib/locale.py index 496cc803c88f7c..170e5eea45b8ca 100644 --- a/Lib/locale.py +++ b/Lib/locale.py @@ -557,7 +557,7 @@ def getdefaultlocale(envvars=('LC_ALL', 'LC_CTYPE', 'LANG', 'LANGUAGE')): import warnings warnings.warn( - "Use setlocale(), getpreferredencoding(False) and getlocale() instead", + "Use setlocale(), getencoding() and getlocale() instead", DeprecationWarning, stacklevel=2 ) diff --git a/Modules/_io/_iomodule.c b/Modules/_io/_iomodule.c index 065f5e29c315bf..38ef24637b7318 100644 --- a/Modules/_io/_iomodule.c +++ b/Modules/_io/_iomodule.c @@ -92,9 +92,9 @@ it already exists), 'x' for creating and writing to a new file, and 'a' for appending (which on some Unix systems, means that all writes append to the end of the file regardless of the current seek position). In text mode, if encoding is not specified the encoding used is platform -dependent: locale.getpreferredencoding(False) is called to get the -current locale encoding. (For reading and writing raw bytes use binary -mode and leave encoding unspecified.) The available modes are: +dependent: locale.getencoding() is called to get the current locale encoding. +(For reading and writing raw bytes use binary mode and leave encoding +unspecified.) 
The available modes are: ========= =============================================================== Character Meaning @@ -196,7 +196,7 @@ static PyObject * _io_open_impl(PyObject *module, PyObject *file, const char *mode, int buffering, const char *encoding, const char *errors, const char *newline, int closefd, PyObject *opener) -/*[clinic end generated code: output=aefafc4ce2b46dc0 input=1543f4511d2356a5]*/ +/*[clinic end generated code: output=aefafc4ce2b46dc0 input=5bb37f174cb2fb11]*/ { unsigned i; diff --git a/Modules/_io/clinic/_iomodule.c.h b/Modules/_io/clinic/_iomodule.c.h index e4a6b8c42e1d84..1fdbe6835c7175 100644 --- a/Modules/_io/clinic/_iomodule.c.h +++ b/Modules/_io/clinic/_iomodule.c.h @@ -22,9 +22,9 @@ PyDoc_STRVAR(_io_open__doc__, "\'a\' for appending (which on some Unix systems, means that all writes\n" "append to the end of the file regardless of the current seek position).\n" "In text mode, if encoding is not specified the encoding used is platform\n" -"dependent: locale.getpreferredencoding(False) is called to get the\n" -"current locale encoding. (For reading and writing raw bytes use binary\n" -"mode and leave encoding unspecified.) The available modes are:\n" +"dependent: locale.getencoding() is called to get the current locale encoding.\n" +"(For reading and writing raw bytes use binary mode and leave encoding\n" +"unspecified.) 
The available modes are:\n" "\n" "========= ===============================================================\n" "Character Meaning\n" @@ -355,4 +355,4 @@ _io_open_code(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec exit: return return_value; } -/*[clinic end generated code: output=1a7fd7755c9a9609 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=e562f29e3c2533a6 input=a9049054013a1b77]*/ diff --git a/Modules/_io/clinic/textio.c.h b/Modules/_io/clinic/textio.c.h index 0b047ac0aab4eb..7e81eb370fab40 100644 --- a/Modules/_io/clinic/textio.c.h +++ b/Modules/_io/clinic/textio.c.h @@ -146,7 +146,7 @@ PyDoc_STRVAR(_io_TextIOWrapper___init____doc__, "Character and line based layer over a BufferedIOBase object, buffer.\n" "\n" "encoding gives the name of the encoding that the stream will be\n" -"decoded or encoded with. It defaults to locale.getpreferredencoding(False).\n" +"decoded or encoded with. It defaults to locale.getencoding().\n" "\n" "errors determines the strictness of encoding and decoding (see\n" "help(codecs.Codec) or the documentation for codecs.register) and\n" @@ -671,4 +671,4 @@ _io_TextIOWrapper_close(textio *self, PyObject *Py_UNUSED(ignored)) { return _io_TextIOWrapper_close_impl(self); } -/*[clinic end generated code: output=2604c8f3a45b9a03 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=e88abad34e31c0cb input=a9049054013a1b77]*/ diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index d2fd2265c41885..6ba7393c3a6a30 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1023,7 +1023,7 @@ _io.TextIOWrapper.__init__ Character and line based layer over a BufferedIOBase object, buffer. encoding gives the name of the encoding that the stream will be -decoded or encoded with. It defaults to locale.getpreferredencoding(False). +decoded or encoded with. It defaults to locale.getencoding(). 
errors determines the strictness of encoding and decoding (see help(codecs.Codec) or the documentation for codecs.register) and @@ -1055,7 +1055,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer, const char *encoding, PyObject *errors, const char *newline, int line_buffering, int write_through) -/*[clinic end generated code: output=72267c0c01032ed2 input=77d8696d1a1f460b]*/ +/*[clinic end generated code: output=72267c0c01032ed2 input=72590963698f289b]*/ { PyObject *raw, *codec_info = NULL; PyObject *res; diff --git a/Tools/c-analyzer/TODO b/Tools/c-analyzer/TODO index 55338ebc855d09..6683df5993b8c0 100644 --- a/Tools/c-analyzer/TODO +++ b/Tools/c-analyzer/TODO @@ -251,7 +251,6 @@ Modules/_io/textio.c:PyId_close _Py_IDENTIFIER( Modules/_io/textio.c:PyId_decode _Py_IDENTIFIER(decode) Modules/_io/textio.c:PyId_fileno _Py_IDENTIFIER(fileno) Modules/_io/textio.c:PyId_flush _Py_IDENTIFIER(flush) -Modules/_io/textio.c:PyId_getpreferredencoding _Py_IDENTIFIER(getpreferredencoding) Modules/_io/textio.c:PyId_isatty _Py_IDENTIFIER(isatty) Modules/_io/textio.c:PyId_mode _Py_IDENTIFIER(mode) Modules/_io/textio.c:PyId_name _Py_IDENTIFIER(name)