Skip to content

Commit

Permalink
gh-81283: compiler: remove indent from docstring (#106411)
Browse files Browse the repository at this point in the history
Co-authored-by: Éric <merwok@netwok.org>
  • Loading branch information
methane and merwok committed Jul 15, 2023
1 parent bbf6297 commit 2566b74
Show file tree
Hide file tree
Showing 9 changed files with 246 additions and 30 deletions.
7 changes: 7 additions & 0 deletions Doc/whatsnew/3.13.rst
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,13 @@ Other Language Changes
* Allow the *count* argument of :meth:`str.replace` to be a keyword.
(Contributed by Hugo van Kemenade in :gh:`106487`.)

* The compiler now strips indents from docstrings.
This reduces the size of the :term:`bytecode cache <bytecode>` (e.g. ``.pyc`` files).
For example, cache file size for ``sqlalchemy.orm.session`` in SQLAlchemy 2.0
is reduced by about 5%.
This change will affect tools using docstrings, like :mod:`doctest`.
(Contributed by Inada Naoki in :gh:`81283`.)

New Modules
===========

Expand Down
2 changes: 2 additions & 0 deletions Include/internal/pycore_compile.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ int _PyCompile_ConstCacheMergeOne(PyObject *const_cache, PyObject **obj);

/* Access compiler internals for unit testing */

PyAPI_FUNC(PyObject*) _PyCompile_CleanDoc(PyObject *doc);

PyAPI_FUNC(PyObject*) _PyCompile_CodeGen(
PyObject *ast,
PyObject *filename,
Expand Down
45 changes: 22 additions & 23 deletions Lib/inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,29 +881,28 @@ def cleandoc(doc):
Any whitespace that can be uniformly removed from the second line
onwards is removed."""
try:
lines = doc.expandtabs().split('\n')
except UnicodeError:
return None
else:
# Find minimum indentation of any non-blank lines after first line.
margin = sys.maxsize
for line in lines[1:]:
content = len(line.lstrip())
if content:
indent = len(line) - content
margin = min(margin, indent)
# Remove indentation.
if lines:
lines[0] = lines[0].lstrip()
if margin < sys.maxsize:
for i in range(1, len(lines)): lines[i] = lines[i][margin:]
# Remove any trailing or leading blank lines.
while lines and not lines[-1]:
lines.pop()
while lines and not lines[0]:
lines.pop(0)
return '\n'.join(lines)
lines = doc.expandtabs().split('\n')

# Find minimum indentation of any non-blank lines after first line.
margin = sys.maxsize
for line in lines[1:]:
content = len(line.lstrip(' '))
if content:
indent = len(line) - content
margin = min(margin, indent)
# Remove indentation.
if lines:
lines[0] = lines[0].lstrip(' ')
if margin < sys.maxsize:
for i in range(1, len(lines)):
lines[i] = lines[i][margin:]
# Remove any trailing or leading blank lines.
while lines and not lines[-1]:
lines.pop()
while lines and not lines[0]:
lines.pop(0)
return '\n'.join(lines)


def getfile(object):
"""Work out which source or compiled file an object was defined in."""
Expand Down
4 changes: 2 additions & 2 deletions Lib/test/test_doctest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1287,14 +1287,14 @@ def optionflags(): r"""
treated as equal:
>>> def f(x):
... '>>> print(1, 2, 3)\n 1 2\n 3'
... '\n>>> print(1, 2, 3)\n 1 2\n 3'
>>> # Without the flag:
>>> test = doctest.DocTestFinder().find(f)[0]
>>> doctest.DocTestRunner(verbose=False).run(test)
... # doctest: +ELLIPSIS
**********************************************************************
File ..., line 2, in f
File ..., line 3, in f
Failed example:
print(1, 2, 3)
Expected:
Expand Down
35 changes: 33 additions & 2 deletions Lib/test/test_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,9 +596,40 @@ def test_finddoc(self):
self.assertEqual(finddoc(int.from_bytes), int.from_bytes.__doc__)
self.assertEqual(finddoc(int.real), int.real.__doc__)

cleandoc_testdata = [
# first line should have different margin
(' An\n indented\n docstring.', 'An\nindented\n docstring.'),
# trailing whitespace is not removed.
(' An \n \n indented \n docstring. ',
'An \n \nindented \n docstring. '),
# NUL is not termination.
('doc\0string\n\n second\0line\n third\0line\0',
'doc\0string\n\nsecond\0line\nthird\0line\0'),
# first line is lstrip()-ped. other lines are kept when no margin.
(' ', ''),
# compiler.cleandoc() doesn't strip leading/trailing newlines
# to keep maximum backward compatibility.
# inspect.cleandoc() removes them.
('\n\n\n first paragraph\n\n second paragraph\n\n',
'\n\n\nfirst paragraph\n\n second paragraph\n\n'),
(' \n \n \n ', '\n \n \n '),
]

def test_cleandoc(self):
self.assertEqual(inspect.cleandoc('An\n indented\n docstring.'),
'An\nindented\ndocstring.')
func = inspect.cleandoc
for i, (input, expected) in enumerate(self.cleandoc_testdata):
# only inspect.cleandoc() strips \n
expected = expected.strip('\n')
with self.subTest(i=i):
self.assertEqual(func(input), expected)

@cpython_only
def test_c_cleandoc(self):
import _testinternalcapi
func = _testinternalcapi.compiler_cleandoc
for i, (input, expected) in enumerate(self.cleandoc_testdata):
with self.subTest(i=i):
self.assertEqual(func(input), expected)

def test_getcomments(self):
self.assertEqual(inspect.getcomments(mod), '# line 1\n')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
The compiler now strips indents from docstrings. This reduces ``pyc`` file size
by about 5% when the module is heavily documented. This change affects
``__doc__``, so tools like doctest will be affected.
20 changes: 19 additions & 1 deletion Modules/_testinternalcapi.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include "pycore_atomic_funcs.h" // _Py_atomic_int_get()
#include "pycore_bitutils.h" // _Py_bswap32()
#include "pycore_bytesobject.h" // _PyBytes_Find()
#include "pycore_compile.h" // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble
#include "pycore_compile.h" // _PyCompile_CodeGen, _PyCompile_OptimizeCfg, _PyCompile_Assemble, _PyCompile_CleanDoc
#include "pycore_ceval.h" // _PyEval_AddPendingCall
#include "pycore_fileutils.h" // _Py_normpath
#include "pycore_frame.h" // _PyInterpreterFrame
Expand Down Expand Up @@ -704,6 +704,23 @@ set_eval_frame_record(PyObject *self, PyObject *list)
Py_RETURN_NONE;
}

/*[clinic input]
_testinternalcapi.compiler_cleandoc -> object
doc: unicode
C implementation of inspect.cleandoc().
[clinic start generated code]*/

static PyObject *
_testinternalcapi_compiler_cleandoc_impl(PyObject *module, PyObject *doc)
/*[clinic end generated code: output=2dd203a80feff5bc input=2de03fab931d9cdc]*/
{
    /* Test-only shim: forward the string to the compiler's docstring
       cleaner and hand its result back to the caller unchanged. */
    PyObject *cleaned = _PyCompile_CleanDoc(doc);
    return cleaned;
}


/*[clinic input]
_testinternalcapi.compiler_codegen -> object
Expand Down Expand Up @@ -1448,6 +1465,7 @@ static PyMethodDef module_functions[] = {
{"DecodeLocaleEx", decode_locale_ex, METH_VARARGS},
{"set_eval_frame_default", set_eval_frame_default, METH_NOARGS, NULL},
{"set_eval_frame_record", set_eval_frame_record, METH_O, NULL},
_TESTINTERNALCAPI_COMPILER_CLEANDOC_METHODDEF
_TESTINTERNALCAPI_COMPILER_CODEGEN_METHODDEF
_TESTINTERNALCAPI_OPTIMIZE_CFG_METHODDEF
_TESTINTERNALCAPI_ASSEMBLE_CODE_OBJECT_METHODDEF
Expand Down
61 changes: 60 additions & 1 deletion Modules/clinic/_testinternalcapi.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

99 changes: 98 additions & 1 deletion Python/compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -1704,10 +1704,16 @@ compiler_body(struct compiler *c, location loc, asdl_stmt_seq *stmts)
if (c->c_optimize < 2) {
docstring = _PyAST_GetDocString(stmts);
if (docstring) {
PyObject *cleandoc = _PyCompile_CleanDoc(docstring);
if (cleandoc == NULL) {
return ERROR;
}
i = 1;
st = (stmt_ty)asdl_seq_GET(stmts, 0);
assert(st->kind == Expr_kind);
VISIT(c, expr, st->v.Expr.value);
location loc = LOC(st->v.Expr.value);
ADDOP_LOAD_CONST(c, loc, cleandoc);
Py_DECREF(cleandoc);
RETURN_IF_ERROR(compiler_nameop(c, NO_LOCATION, &_Py_ID(__doc__), Store));
}
}
Expand Down Expand Up @@ -2252,11 +2258,19 @@ compiler_function_body(struct compiler *c, stmt_ty s, int is_async, Py_ssize_t f
/* if not -OO mode, add docstring */
if (c->c_optimize < 2) {
docstring = _PyAST_GetDocString(body);
if (docstring) {
docstring = _PyCompile_CleanDoc(docstring);
if (docstring == NULL) {
compiler_exit_scope(c);
return ERROR;
}
}
}
if (compiler_add_const(c->c_const_cache, c->u, docstring ? docstring : Py_None) < 0) {
compiler_exit_scope(c);
return ERROR;
}
Py_XDECREF(docstring);

c->u->u_metadata.u_argcount = asdl_seq_LEN(args->args);
c->u->u_metadata.u_posonlyargcount = asdl_seq_LEN(args->posonlyargs);
Expand Down Expand Up @@ -7967,6 +7981,89 @@ cfg_to_instructions(cfg_builder *g)
return NULL;
}

// C implementation of inspect.cleandoc()
//
// Difference from inspect.cleandoc():
// - Do not remove leading and trailing blank lines to keep lineno.
//
// Tabs are expanded first, then leading spaces are stripped from the first
// line and the common space-only margin is stripped from every later line.
//
// Returns a new reference to the cleaned string, or NULL with an exception
// set if expandtabs(), the UTF-8 conversion, or the scratch-buffer
// allocation fails.
PyObject *
_PyCompile_CleanDoc(PyObject *doc)
{
    // From here on `doc` is our own new reference to the tab-expanded copy.
    doc = PyObject_CallMethod(doc, "expandtabs", NULL);
    if (doc == NULL) {
        return NULL;
    }

    Py_ssize_t doc_size;
    const char *doc_utf8 = PyUnicode_AsUTF8AndSize(doc, &doc_size);
    if (doc_utf8 == NULL) {
        Py_DECREF(doc);
        return NULL;
    }
    const char *p = doc_utf8;
    const char *pend = p + doc_size;

    // First pass: find minimum indentation of any non-blank lines
    // after first line.
    while (p < pend && *p++ != '\n') {
    }

    Py_ssize_t margin = PY_SSIZE_T_MAX;
    while (p < pend) {
        const char *s = p;
        // No bounds check needed: the UTF-8 buffer carries a trailing NUL
        // (see PyUnicode_AsUTF8AndSize), which is not a space.
        while (*p == ' ') {
            p++;
        }
        // Only lines with real content contribute to the margin.
        if (p < pend && *p != '\n') {
            margin = Py_MIN(margin, p - s);
        }
        while (p < pend && *p++ != '\n') {
        }
    }
    if (margin == PY_SSIZE_T_MAX) {
        // No indented content after the first line: nothing to strip.
        margin = 0;
    }

    // Second pass: write cleandoc into buff.

    // copy first line without leading spaces.
    p = doc_utf8;
    while (*p == ' ') {
        p++;
    }
    if (p == doc_utf8 && margin == 0) {
        // doc is already clean.
        return doc;
    }

    // The output is never longer than the input, so doc_size bytes suffice.
    char *buff = PyMem_Malloc(doc_size);
    if (buff == NULL) {
        Py_DECREF(doc);
        PyErr_NoMemory();
        return NULL;
    }
    char *w = buff;

    while (p < pend) {
        int ch = *w++ = *p++;
        if (ch == '\n') {
            break;
        }
    }

    // copy subsequent lines without margin.
    while (p < pend) {
        // Skip up to `margin` spaces; blank or short lines may end early.
        for (Py_ssize_t i = 0; i < margin; i++, p++) {
            if (*p != ' ') {
                assert(*p == '\n' || *p == '\0');
                break;
            }
        }
        while (p < pend) {
            int ch = *w++ = *p++;
            if (ch == '\n') {
                break;
            }
        }
    }

    Py_DECREF(doc);
    // PyUnicode_FromStringAndSize copies the bytes, so the scratch buffer
    // must be released here whether or not the conversion succeeded.
    PyObject *res = PyUnicode_FromStringAndSize(buff, w - buff);
    PyMem_Free(buff);
    return res;
}


PyObject *
_PyCompile_CodeGen(PyObject *ast, PyObject *filename, PyCompilerFlags *pflags,
int optimize, int compile_mode)
Expand Down

0 comments on commit 2566b74

Please sign in to comment.