From 6776d0228053103ba262963635f3eba5b76e6471 Mon Sep 17 00:00:00 2001 From: Rob Malouf Date: Thu, 18 May 2017 15:44:39 -0700 Subject: [PATCH 1/2] backport TextIOWrapper improvements from 3.3 to 2.7 --- Lib/_pyio.py | 60 +++++++++++++++--- Modules/_io/textio.c | 144 +++++++++++++++++++++++++++++++------------ 2 files changed, 155 insertions(+), 49 deletions(-) mode change 100644 => 100755 Lib/_pyio.py mode change 100644 => 100755 Modules/_io/textio.c diff --git a/Lib/_pyio.py b/Lib/_pyio.py old mode 100644 new mode 100755 index f022a4e88b8ed7..119a75eaee9be8 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1523,6 +1523,7 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None, self._decoded_chars_used = 0 # offset into _decoded_chars for read() self._snapshot = None # info for reconstructing decoder state self._seekable = self._telling = self.buffer.seekable() + self._b2cratio = 0.0 if self._seekable and self.writable(): position = self.buffer.tell() @@ -1637,7 +1638,7 @@ def _get_decoder(self): self._decoder = decoder return decoder - # The following three methods implement an ADT for _decoded_chars. + # The following three methods implement an ADsT for _decoded_chars. # Text returned from the decoder is buffered here until the client # requests it by calling our read() or readline() method. def _set_decoded_chars(self, chars): @@ -1686,8 +1687,12 @@ def _read_chunk(self): # Read a chunk, decode it, and put the result in self._decoded_chars. input_chunk = self.buffer.read1(self._CHUNK_SIZE) eof = not input_chunk - self._set_decoded_chars(self._decoder.decode(input_chunk, eof)) - + decoded_chars = self._decoder.decode(input_chunk, eof) + self._set_decoded_chars(decoded_chars) + if decoded_chars: + self._b2cratio = len(input_chunk) / len(self._decoded_chars) + else: + self._b2cratio = 0.0 if self._telling: # At the snapshot point, len(dec_buffer) bytes before the read, # the next input to be decoded is dec_buffer + input_chunk. @@ -1740,19 +1745,56 @@ def tell(self): # forward until it gives us enough decoded characters. saved_state = decoder.getstate() try: + # Fast search for an acceptable start point, close to our + # current pos. + # Rationale: calling decoder.decode() has a large overhead + # regardless of chunk size; we want the number of such calls to + # be O(1) in most situations (common decoders, non-crazy input). + # Actually, it will be exactly 1 for fixed-size codecs (all + # 8-bit codecs, also UTF-16 and UTF-32). + skip_bytes = int(self._b2cratio * chars_to_skip) + skip_back = 1 + assert skip_bytes <= len(next_input) + while skip_bytes > 0: + decoder.setstate((b'', dec_flags)) + # Decode up to temptative start point + n = len(decoder.decode(next_input[:skip_bytes])) + if n <= chars_to_skip: + b, d = decoder.getstate() + if not b: + # Before pos and no bytes buffered in decoder => OK + dec_flags = d + chars_to_skip -= n + break + # Skip back by buffered amount and reset heuristic + skip_bytes -= len(b) + skip_back = 1 + else: + # We're too far ahead, skip back a bit + skip_bytes -= skip_back + skip_back = skip_back * 2 + else: + skip_bytes = 0 + decoder.setstate((b'', dec_flags)) + # Note our initial start point. - decoder.setstate((b'', dec_flags)) - start_pos = position - start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0 - need_eof = 0 + start_pos = position + skip_bytes + start_flags = dec_flags + if chars_to_skip == 0: + # We haven't moved from the start point. + return self._pack_cookie(start_pos, start_flags) # Feed the decoder one byte at a time. As we go, note the # nearest "safe start point" before the current location # (a point where the decoder has nothing buffered, so seek() # can safely start from there and advance to this location). - for next_byte in next_input: + bytes_fed = 0 + need_eof = 0 + # Chars decoded since `start_pos` + chars_decoded = 0 + for i in range(skip_bytes, len(next_input)): bytes_fed += 1 - chars_decoded += len(decoder.decode(next_byte)) + chars_decoded += len(decoder.decode(next_input[i:i+1])) dec_buffer, dec_flags = decoder.getstate() if not dec_buffer and chars_decoded <= chars_to_skip: # Decoder buffer is empty, so this is a safe start point. diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c old mode 100644 new mode 100755 index f13dcb4366e06f..1330119d74505a --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -693,12 +693,15 @@ typedef struct PyObject *pending_bytes; /* list of bytes objects waiting to be written, or NULL */ Py_ssize_t pending_bytes_count; - PyObject *snapshot; /* snapshot is either None, or a tuple (dec_flags, next_input) where * dec_flags is the second (integer) item of the decoder state and * next_input is the chunk of input bytes that comes next after the * snapshot point. We use this to reconstruct decoder states in tell(). */ + PyObject *snapshot; + /* Bytes-to-characters ratio for the current chunk. Serves as input for + the heuristic in tell(). */ + double b2cratio; /* Cache raw object if it's a FileIO object */ PyObject *raw; @@ -865,6 +868,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) self->pending_bytes_count = 0; self->encodefunc = NULL; self->writenl = NULL; + self->b2cratio = 0.0; if (encoding == NULL && self->encoding == NULL) { if (_PyIO_locale_module == NULL) { @@ -1391,6 +1395,7 @@ textiowrapper_read_chunk(textio *self) PyObject *dec_flags = NULL; PyObject *input_chunk = NULL; PyObject *decoded_chars, *chunk_size; + Py_ssize_t nbytes, nchars; int eof; /* The return value is True unless EOF was reached. The decoded string is @@ -1441,7 +1446,8 @@ textiowrapper_read_chunk(textio *self) goto fail; } - eof = (PyBytes_Size(input_chunk) == 0); + nbytes = PyBytes_Size(input_chunk); + eof = (nbytes == 0); if (Py_TYPE(self->decoder) == &PyIncrementalNewlineDecoder_Type) { decoded_chars = _PyIncrementalNewlineDecoder_decode( @@ -1455,7 +1461,12 @@ textiowrapper_read_chunk(textio *self) if (check_decoded(decoded_chars) < 0) goto fail; textiowrapper_set_decoded_chars(self, decoded_chars); - if (PyUnicode_GET_SIZE(decoded_chars) > 0) + nchars = PyUnicode_GET_SIZE(decoded_chars); + if (nchars > 0) + self->b2cratio = (double) nbytes / nchars; + else + self->b2cratio = 0.0; + if (nchars > 0) eof = 0; if (self->telling) { @@ -2191,8 +2202,11 @@ textiowrapper_tell(textio *self, PyObject *args) cookie_type cookie = {0,0,0,0,0}; PyObject *next_input; Py_ssize_t chars_to_skip, chars_decoded; + Py_ssize_t skip_bytes, skip_back; PyObject *saved_state = NULL; char *input, *input_end; + Py_ssize_t dec_buffer_len; + int dec_flags; CHECK_ATTACHED(self); CHECK_CLOSED(self); @@ -2229,11 +2243,12 @@ textiowrapper_tell(textio *self, PyObject *args) #else cookie.start_pos = PyLong_AsLong(posobj); #endif + Py_DECREF(posobj); if (PyErr_Occurred()) goto fail; /* Skip backward to the snapshot point (see _read_chunk). */ - if (!PyArg_Parse(self->snapshot, "(iO)", &cookie.dec_flags, &next_input)) + if (!PyArg_ParseTuple(self->snapshot, "iO", &cookie.dec_flags, &next_input)) goto fail; assert (PyBytes_Check(next_input)); @@ -2243,56 +2258,107 @@ textiowrapper_tell(textio *self, PyObject *args) /* How many decoded characters have been used up since the snapshot? */ if (self->decoded_chars_used == 0) { /* We haven't moved from the snapshot point. */ - Py_DECREF(posobj); return textiowrapper_build_cookie(&cookie); } chars_to_skip = self->decoded_chars_used; - /* Starting from the snapshot position, we will walk the decoder - * forward until it gives us enough decoded characters. - */ + /* Decoder state will be restored at the end */ saved_state = PyObject_CallMethodObjArgs(self->decoder, _PyIO_str_getstate, NULL); if (saved_state == NULL) goto fail; - /* Note our initial start point. */ - if (_textiowrapper_decoder_setstate(self, &cookie) < 0) - goto fail; +#define DECODER_GETSTATE() do { \ + char *dec_buffer; \ + PyObject *_state = PyObject_CallMethodObjArgs(self->decoder, \ + _PyIO_str_getstate, NULL); \ + if (_state == NULL) \ + goto fail; \ + if (!PyArg_ParseTuple(_state, "Oi", &dec_buffer, &dec_flags)) { \ + Py_DECREF(_state); \ + goto fail; \ + } \ + if (!PyBytes_Check(dec_buffer)) { \ + PyErr_Format(PyExc_TypeError, \ + "decoder getstate() should have returned a bytes " \ + "object, not '%.200s'", \ + Py_TYPE(dec_buffer)->tp_name); \ + Py_DECREF(_state); \ + goto fail; \ + } \ + dec_buffer_len = PyBytes_GET_SIZE(dec_buffer); \ + Py_DECREF(_state); \ + } while (0) + +#define DECODER_DECODE(start, len, res) do { \ + PyObject *_decoded = PyObject_CallMethod( \ + self->decoder, "decode", "s#", start, len); \ + if (check_decoded(_decoded) < 0) \ + goto fail; \ + res = PyUnicode_GET_SIZE(_decoded); \ + Py_DECREF(_decoded); \ + } while (0) + + /* Fast search for an acceptable start point, close to our + current pos */ + skip_bytes = (Py_ssize_t) (self->b2cratio * chars_to_skip); + skip_back = 1; + assert(skip_back <= PyBytes_GET_SIZE(next_input)); + input = PyBytes_AS_STRING(next_input); + while (skip_bytes > 0) { + /* Decode up to temptative start point */ + if (_textiowrapper_decoder_setstate(self, &cookie) < 0) + goto fail; + DECODER_DECODE(input, skip_bytes, chars_decoded); + if (chars_decoded <= chars_to_skip) { + DECODER_GETSTATE(); + if (dec_buffer_len == 0) { + /* Before pos and no bytes buffered in decoder => OK */ + cookie.dec_flags = dec_flags; + chars_to_skip -= chars_decoded; + break; + } + /* Skip back by buffered amount and reset heuristic */ + skip_bytes -= dec_buffer_len; + skip_back = 1; + } + else { + /* We're too far ahead, skip back a bit */ + skip_bytes -= skip_back; + skip_back *= 2; + } + } + if (skip_bytes <= 0) { + skip_bytes = 0; + if (_textiowrapper_decoder_setstate(self, &cookie) < 0) + goto fail; + } - /* Feed the decoder one byte at a time. As we go, note the - * nearest "safe start point" before the current location - * (a point where the decoder has nothing buffered, so seek() + /* Note our initial start point. */ + cookie.start_pos += skip_bytes; + cookie.chars_to_skip = Py_SAFE_DOWNCAST(chars_to_skip, Py_ssize_t, int); + if (chars_to_skip == 0) + goto finally; + + /* We should be close to the desired position. Now feed the decoder one + * byte at a time until we reach the `chars_to_skip` target. + * As we go, note the nearest "safe start point" before the current + * location (a point where the decoder has nothing buffered, so seek() * can safely start from there and advance to this location). */ chars_decoded = 0; input = PyBytes_AS_STRING(next_input); input_end = input + PyBytes_GET_SIZE(next_input); + input += skip_bytes; while (input < input_end) { - PyObject *state; - char *dec_buffer; - Py_ssize_t dec_buffer_len; - int dec_flags; - - PyObject *decoded = PyObject_CallMethod( - self->decoder, "decode", "s#", input, (Py_ssize_t)1); - if (check_decoded(decoded) < 0) - goto fail; - chars_decoded += PyUnicode_GET_SIZE(decoded); - Py_DECREF(decoded); + Py_ssize_t n; + DECODER_DECODE(input, (Py_ssize_t)1, n); + /* We got n chars for 1 byte */ + chars_decoded += n; cookie.bytes_to_feed += 1; - - state = PyObject_CallMethodObjArgs(self->decoder, - _PyIO_str_getstate, NULL); - if (state == NULL) - goto fail; - if (!PyArg_Parse(state, "(s#i)", &dec_buffer, &dec_buffer_len, &dec_flags)) { - Py_DECREF(state); - goto fail; - } - Py_DECREF(state); + DECODER_GETSTATE(); if (dec_buffer_len == 0 && chars_decoded <= chars_to_skip) { /* Decoder buffer is empty, so this is a safe start point. */ @@ -2323,8 +2389,8 @@ textiowrapper_tell(textio *self, PyObject *args) } } - /* finally */ - Py_XDECREF(posobj); + +finally: res = PyObject_CallMethod(self->decoder, "setstate", "(O)", saved_state); Py_DECREF(saved_state); if (res == NULL) @@ -2335,12 +2401,10 @@ textiowrapper_tell(textio *self, PyObject *args) cookie.chars_to_skip = Py_SAFE_DOWNCAST(chars_to_skip, Py_ssize_t, int); return textiowrapper_build_cookie(&cookie); - fail: - Py_XDECREF(posobj); +fail: if (saved_state) { PyObject *type, *value, *traceback; PyErr_Fetch(&type, &value, &traceback); - res = PyObject_CallMethod(self->decoder, "setstate", "(O)", saved_state); _PyErr_ReplaceException(type, value, traceback); Py_DECREF(saved_state); From ca7b013f7877ea444322f283d615f8b616f132f0 Mon Sep 17 00:00:00 2001 From: Rob Malouf Date: Mon, 22 May 2017 17:11:07 -0700 Subject: [PATCH 2/2] fix typo --- Lib/_pyio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 119a75eaee9be8..86fb928ae786b9 100755 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1638,7 +1638,7 @@ def _get_decoder(self): self._decoder = decoder return decoder - # The following three methods implement an ADsT for _decoded_chars. + # The following three methods implement an ADT for _decoded_chars. # Text returned from the decoder is buffered here until the client # requests it by calling our read() or readline() method. def _set_decoded_chars(self, chars):