From ece341cd7c7facd3114649ef4404f8777baf9013 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 19 Apr 2022 18:21:55 -0700 Subject: [PATCH 1/4] Make MSVC generate somewhat faster switch code Apparently a switch on an 8-bit quantity where all cases are present generates a more efficient jump (doing only one indexed memory load instead of two). See https://github.com/faster-cpython/ideas/issues/321#issuecomment-1103263673 --- Include/opcode.h | 78 ++++++++++++++++++++++++++++++ Python/ceval.c | 6 ++- Tools/scripts/generate_opcode_h.py | 7 +++ 3 files changed, 89 insertions(+), 2 deletions(-) diff --git a/Include/opcode.h b/Include/opcode.h index 8db42380cedd11..89f56e27ed4372 100644 --- a/Include/opcode.h +++ b/Include/opcode.h @@ -707,6 +707,84 @@ static const char *const _PyOpcode_OpName[256] = { }; #endif +#define EXTRA_CASES \ + case 180: \ + case 181: \ + case 182: \ + case 183: \ + case 184: \ + case 185: \ + case 186: \ + case 187: \ + case 188: \ + case 189: \ + case 190: \ + case 191: \ + case 192: \ + case 193: \ + case 194: \ + case 195: \ + case 196: \ + case 197: \ + case 198: \ + case 199: \ + case 200: \ + case 201: \ + case 202: \ + case 203: \ + case 204: \ + case 205: \ + case 206: \ + case 207: \ + case 208: \ + case 209: \ + case 210: \ + case 211: \ + case 212: \ + case 213: \ + case 214: \ + case 215: \ + case 216: \ + case 217: \ + case 218: \ + case 219: \ + case 220: \ + case 221: \ + case 222: \ + case 223: \ + case 224: \ + case 225: \ + case 226: \ + case 227: \ + case 228: \ + case 229: \ + case 230: \ + case 231: \ + case 232: \ + case 233: \ + case 234: \ + case 235: \ + case 236: \ + case 237: \ + case 238: \ + case 239: \ + case 240: \ + case 241: \ + case 242: \ + case 243: \ + case 244: \ + case 245: \ + case 246: \ + case 247: \ + case 248: \ + case 249: \ + case 250: \ + case 251: \ + case 252: \ + case 253: \ + case 254: \ + ; + #define HAS_ARG(op) ((op) >= HAVE_ARGUMENT) /* Reserve some bytecodes for internal use in the compiler. diff --git a/Python/ceval.c b/Python/ceval.c index 45754ffbe7cb4a..94fa98f41e72ea 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1774,7 +1774,9 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int { #else dispatch_opcode: - switch (opcode) { + // The cast, combined with the EXTRA_CASES macro, + // causes MSVC to generate a faster switch. + switch ((unsigned char)opcode) { #endif /* BEWARE! @@ -5645,7 +5647,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #if USE_COMPUTED_GOTOS _unknown_opcode: #else - default: + EXTRA_CASES // From opcode.h, a 'case' for each unused opcode #endif fprintf(stderr, "XXX lineno: %d, opcode: %d\n", _PyInterpreterFrame_GetLine(frame), opcode); diff --git a/Tools/scripts/generate_opcode_h.py b/Tools/scripts/generate_opcode_h.py index a13fa778db4ea8..cd3df40e65ae48 100644 --- a/Tools/scripts/generate_opcode_h.py +++ b/Tools/scripts/generate_opcode_h.py @@ -129,6 +129,13 @@ def main(opcode_py, outfile='Include/opcode.h'): fobj.write("};\n") fobj.write("#endif\n") + fobj.write("\n") + fobj.write("#define EXTRA_CASES \\\n") + for i, flag in enumerate(used): + if not flag and i != 255: + fobj.write(f" case {i}: \\\n") + fobj.write(" ;\n") + fobj.write(footer) From 51ec1089e471f962a4a240a2bca8aee3a2e831c6 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 20 Apr 2022 11:18:48 -0700 Subject: [PATCH 2/4] Make opcode and use_tracing uint8_t --- Include/cpython/pystate.h | 2 +- Python/ceval.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index 1af21a2c947d99..2bd46067cbbe18 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -46,7 +46,7 @@ typedef struct _PyCFrame { * discipline and make sure that instances of this struct cannot * accessed outside of their lifetime. */ - int use_tracing; + uint8_t use_tracing; // 0 or 255 (or'ed into opcode, hence 8-bit type) /* Pointer to the currently executing frame (it can be NULL) */ struct _PyInterpreterFrame *current_frame; struct _PyCFrame *previous; diff --git a/Python/ceval.c b/Python/ceval.c index 94fa98f41e72ea..5d5f418ab63725 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1662,7 +1662,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #ifdef Py_STATS int lastopcode = 0; #endif - int opcode; /* Current opcode */ + uint8_t opcode; /* Current opcode */ int oparg; /* Current opcode argument, if any */ _Py_atomic_int * const eval_breaker = &tstate->interp->ceval.eval_breaker; @@ -1776,7 +1776,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int dispatch_opcode: // The cast, combined with the EXTRA_CASES macro, // causes MSVC to generate a faster switch. - switch ((unsigned char)opcode) { + switch (opcode) { #endif /* BEWARE! From 6603d85be8585b5d753e97a3d22df002e4fd5806 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 20 Apr 2022 15:32:53 -0700 Subject: [PATCH 3/4] Fix/move comment about opcode and switch --- Python/ceval.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 5d5f418ab63725..1cef657ea96e00 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1662,6 +1662,8 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #ifdef Py_STATS int lastopcode = 0; #endif + // opcode is an 8-bit value to improve the code generated by MSVC + // for the big switch below (in combination with the EXTRA_CASES macro). uint8_t opcode; /* Current opcode */ int oparg; /* Current opcode argument, if any */ _Py_atomic_int * const eval_breaker = &tstate->interp->ceval.eval_breaker; @@ -1774,8 +1776,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int { #else dispatch_opcode: - // The cast, combined with the EXTRA_CASES macro, - // causes MSVC to generate a faster switch. switch (opcode) { #endif From 558d2c482d9c9e3c57f28398b725c656ea2d8260 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 20 Apr 2022 15:34:42 -0700 Subject: [PATCH 4/4] No need to check for opcode 255 any more --- Tools/scripts/generate_opcode_h.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/scripts/generate_opcode_h.py b/Tools/scripts/generate_opcode_h.py index cd3df40e65ae48..1b45020835dd40 100644 --- a/Tools/scripts/generate_opcode_h.py +++ b/Tools/scripts/generate_opcode_h.py @@ -132,7 +132,7 @@ def main(opcode_py, outfile='Include/opcode.h'): fobj.write("\n") fobj.write("#define EXTRA_CASES \\\n") for i, flag in enumerate(used): - if not flag and i != 255: + if not flag: fobj.write(f" case {i}: \\\n") fobj.write(" ;\n")