Skip to content

performance regression in ssl module under free-threading #140795

@kumaraditya303

Description

@kumaraditya303

On the free-threading build there is a large (~20%) performance regression in the asyncio_tcp_ssl benchmark. A large part of the slowdown comes from #124993, which added critical sections and locks for thread safety; however, 20% is a large slowdown for the important single-threaded use case.

Critical sections are slow, especially for dynamically loaded extensions, because accessing the thread state is slow and there are multiple function calls even on the uncontended fast path of acquiring a critical section.

Comparing the assembly of _ssl_RAND_status in the free-threading build vs. the normal build:

  • free-threading:
Dump of assembler code for function _ssl_RAND_status:
   0x00007ffff7b97090 <+0>:     push   %rbx
   0x00007ffff7b97091 <+1>:     sub    $0x10,%rsp
   0x00007ffff7b97095 <+5>:     lea    0xa(%rdi),%rbx
   0x00007ffff7b97099 <+9>:     mov    $0x1,%cl
   0x00007ffff7b9709b <+11>:    xor    %eax,%eax
   0x00007ffff7b9709d <+13>:    lock cmpxchg %cl,0xa(%rdi)
   0x00007ffff7b970a2 <+18>:    jne    0x7ffff7b970c5 <_ssl_RAND_status+53>
   0x00007ffff7b970a4 <+20>:    call   0x7ffff7b950e0 <_PyThreadState_GetCurrent@plt>
   0x00007ffff7b970a9 <+25>:    mov    %rbx,0x8(%rsp)
   0x00007ffff7b970ae <+30>:    mov    0xb0(%rax),%rcx
   0x00007ffff7b970b5 <+37>:    mov    %rcx,(%rsp)
   0x00007ffff7b970b9 <+41>:    mov    %rsp,%rcx
   0x00007ffff7b970bc <+44>:    mov    %rcx,0xb0(%rax)
   0x00007ffff7b970c3 <+51>:    jmp    0x7ffff7b970d0 <_ssl_RAND_status+64>
   0x00007ffff7b970c5 <+53>:    mov    %rsp,%rdi
   0x00007ffff7b970c8 <+56>:    mov    %rbx,%rsi
   0x00007ffff7b970cb <+59>:    call   0x7ffff7b95cc0 <_PyCriticalSection_BeginSlow@plt>
   0x00007ffff7b970d0 <+64>:    call   0x7ffff7b95c30 <RAND_status@plt>
   0x00007ffff7b970d5 <+69>:    movslq %eax,%rdi
   0x00007ffff7b970d8 <+72>:    call   0x7ffff7b96560 <PyBool_FromLong@plt>
   0x00007ffff7b970dd <+77>:    mov    %rax,%rbx
   0x00007ffff7b970e0 <+80>:    mov    0x8(%rsp),%rdi
   0x00007ffff7b970e5 <+85>:    test   %rdi,%rdi
   0x00007ffff7b970e8 <+88>:    je     0x7ffff7b97116 <_ssl_RAND_status+134>
   0x00007ffff7b970ea <+90>:    xor    %ecx,%ecx
   0x00007ffff7b970ec <+92>:    mov    $0x1,%al
   0x00007ffff7b970ee <+94>:    lock cmpxchg %cl,(%rdi)
   0x00007ffff7b970f2 <+98>:    je     0x7ffff7b970f9 <_ssl_RAND_status+105>
   0x00007ffff7b970f4 <+100>:   call   0x7ffff7b954a0 <PyMutex_Unlock@plt>
   0x00007ffff7b970f9 <+105>:   call   0x7ffff7b950e0 <_PyThreadState_GetCurrent@plt>
   0x00007ffff7b970fe <+110>:   mov    (%rsp),%rcx
   0x00007ffff7b97102 <+114>:   mov    %rcx,0xb0(%rax)
   0x00007ffff7b97109 <+121>:   test   $0x1,%cl
   0x00007ffff7b9710c <+124>:   je     0x7ffff7b97116 <_ssl_RAND_status+134>
   0x00007ffff7b9710e <+126>:   mov    %rax,%rdi
   0x00007ffff7b97111 <+129>:   call   0x7ffff7b95f90 <_PyCriticalSection_Resume@plt>
   0x00007ffff7b97116 <+134>:   mov    %rbx,%rax
   0x00007ffff7b97119 <+137>:   add    $0x10,%rsp
   0x00007ffff7b9711d <+141>:   pop    %rbx
   0x00007ffff7b9711e <+142>:   ret
End of assembler dump.
  • normal build:
   0x00007ffff773bd00 <+0>:     push   %rax
   0x00007ffff773bd01 <+1>:     call   0x7ffff773ac20 <RAND_status@plt>
   0x00007ffff773bd06 <+6>:     movslq %eax,%rdi
   0x00007ffff773bd09 <+9>:     pop    %rax
   0x00007ffff773bd0a <+10>:    jmp    0x7ffff773b520 <PyBool_FromLong@plt>

Linked PRs

Metadata

Metadata

Assignees

No one assigned

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions