@@ -32,6 +32,11 @@ typedef struct
 {
     PyObject_HEAD struct tok_state *tok;
     int done;
+
+    /* Needed to cache line for performance */
+    PyObject *last_line;
+    Py_ssize_t last_lineno;
+    Py_ssize_t byte_col_offset_diff;
 } tokenizeriterobject;
 
 /*[clinic input]
@@ -68,6 +73,11 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
         self->tok->tok_extra_tokens = 1;
     }
     self->done = 0;
+
+    self->last_line = NULL;
+    self->byte_col_offset_diff = 0;
+    self->last_lineno = 0;
+
     return (PyObject *)self;
 }
 
@@ -210,7 +220,18 @@ tokenizeriter_next(tokenizeriterobject *it)
         if (size >= 1 && it->tok->implicit_newline) {
             size -= 1;
         }
-        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+
+        if (it->tok->lineno != it->last_lineno) {
+            // Line has changed since last token, so we fetch the new line and cache it
+            // in the iter object.
+            Py_XDECREF(it->last_line);
+            line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+            it->last_line = line;
+            it->byte_col_offset_diff = 0;
+        } else {
+            // Line hasn't changed so we reuse the cached one.
+            line = it->last_line;
+        }
     }
     if (line == NULL) {
         Py_DECREF(str);
@@ -219,13 +240,28 @@ tokenizeriter_next(tokenizeriterobject *it)
 
     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
+    it->last_lineno = lineno;
+
     Py_ssize_t col_offset = -1;
     Py_ssize_t end_col_offset = -1;
+    Py_ssize_t byte_offset = -1;
     if (token.start != NULL && token.start >= line_start) {
-        col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
+        byte_offset = token.start - line_start;
+        col_offset = byte_offset - it->byte_col_offset_diff;
     }
     if (token.end != NULL && token.end >= it->tok->line_start) {
-        end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
+        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
+        if (lineno == end_lineno) {
+            // If the whole token is at the same line, we can just use the token.start
+            // buffer for figuring out the new column offset, since using line is not
+            // performant for very long lines.
+            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
+            end_col_offset = col_offset + token_col_offset;
+            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
+        } else {
+            end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
+            it->byte_col_offset_diff += end_byte_offset - end_col_offset;
+        }
     }
 
     if (it->tok->tok_extra_tokens) {
@@ -262,7 +298,7 @@ tokenizeriter_next(tokenizeriterobject *it)
         }
     }
 
-    result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
+    result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
     if (type == ENDMARKER) {
0 commit comments