@@ -47,18 +47,18 @@ jit_error(const char *message)
4747 PyErr_Format (PyExc_RuntimeWarning , "JIT %s (%d)" , message , hint );
4848}
4949
50- static char *
50+ static unsigned char *
5151jit_alloc (size_t size )
5252{
5353 assert (size );
5454 assert (size % get_page_size () == 0 );
5555#ifdef MS_WINDOWS
5656 int flags = MEM_COMMIT | MEM_RESERVE ;
57- char * memory = VirtualAlloc (NULL , size , flags , PAGE_READWRITE );
57+ unsigned char * memory = VirtualAlloc (NULL , size , flags , PAGE_READWRITE );
5858 int failed = memory == NULL ;
5959#else
6060 int flags = MAP_ANONYMOUS | MAP_PRIVATE ;
61- char * memory = mmap (NULL , size , PROT_READ | PROT_WRITE , flags , -1 , 0 );
61+ unsigned char * memory = mmap (NULL , size , PROT_READ | PROT_WRITE , flags , -1 , 0 );
6262 int failed = memory == MAP_FAILED ;
6363#endif
6464 if (failed ) {
@@ -69,7 +69,7 @@ jit_alloc(size_t size)
6969}
7070
7171static int
72- jit_free (char * memory , size_t size )
72+ jit_free (unsigned char * memory , size_t size )
7373{
7474 assert (size );
7575 assert (size % get_page_size () == 0 );
@@ -86,7 +86,7 @@ jit_free(char *memory, size_t size)
8686}
8787
8888static int
89- mark_executable (char * memory , size_t size )
89+ mark_executable (unsigned char * memory , size_t size )
9090{
9191 if (size == 0 ) {
9292 return 0 ;
@@ -113,7 +113,7 @@ mark_executable(char *memory, size_t size)
113113}
114114
115115static int
116- mark_readable (char * memory , size_t size )
116+ mark_readable (unsigned char * memory , size_t size )
117117{
118118 if (size == 0 ) {
119119 return 0 ;
@@ -169,18 +169,20 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
169169// Fill all of stencil's holes in the memory pointed to by base, using the
170170// values in patches.
171171static void
172- patch (char * base , const Stencil * stencil , uint64_t * patches )
172+ patch (unsigned char * base , const Stencil * stencil , uint64_t * patches )
173173{
174174 for (uint64_t i = 0 ; i < stencil -> holes_size ; i ++ ) {
175175 const Hole * hole = & stencil -> holes [i ];
176- void * location = base + hole -> offset ;
176+ unsigned char * location = base + hole -> offset ;
177177 uint64_t value = patches [hole -> value ] + (uint64_t )hole -> symbol + hole -> addend ;
178+ uint8_t * loc8 = (uint8_t * )location ;
178179 uint32_t * loc32 = (uint32_t * )location ;
179180 uint64_t * loc64 = (uint64_t * )location ;
180181 // LLD is a great reference for performing relocations... just keep in
181182 // mind that Tools/jit/build.py does filtering and preprocessing for us!
182183 // Here's a good place to start for each platform:
183184 // - aarch64-apple-darwin:
185+ // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp
184186 // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp
185187 // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h
186188 // - aarch64-unknown-linux-gnu:
@@ -208,6 +210,47 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
208210 // 64-bit absolute address.
209211 * loc64 = value ;
210212 continue ;
213+ case HoleKind_R_X86_64_GOTPCRELX :
214+ case HoleKind_R_X86_64_REX_GOTPCRELX :
215+ case HoleKind_X86_64_RELOC_GOT :
216+ case HoleKind_X86_64_RELOC_GOT_LOAD : {
217+ // 32-bit relative address.
218+ // Try to relax the GOT load into an immediate value:
219+ uint64_t relaxed = * (uint64_t * )(value + 4 ) - 4 ;
220+ if ((int64_t )relaxed - (int64_t )location >= - (1LL << 31 ) &&
221+ (int64_t )relaxed - (int64_t )location + 1 < (1LL << 31 ))
222+ {
223+ if (loc8 [-2 ] == 0x8B ) {
224+ // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX]
225+ loc8 [-2 ] = 0x8D ;
226+ value = relaxed ;
227+ }
228+ else if (loc8 [-2 ] == 0xFF && loc8 [-1 ] == 0x15 ) {
229+ // call qword ptr [rip + AAA] -> nop; call XXX
230+ loc8 [-2 ] = 0x90 ;
231+ loc8 [-1 ] = 0xE8 ;
232+ value = relaxed ;
233+ }
234+ else if (loc8 [-2 ] == 0xFF && loc8 [-1 ] == 0x25 ) {
235+ // jmp qword ptr [rip + AAA] -> nop; jmp XXX
236+ loc8 [-2 ] = 0x90 ;
237+ loc8 [-1 ] = 0xE9 ;
238+ value = relaxed ;
239+ }
240+ }
241+ }
242+ // Fall through...
243+ case HoleKind_R_X86_64_GOTPCREL :
244+ case HoleKind_R_X86_64_PC32 :
245+ case HoleKind_X86_64_RELOC_SIGNED :
246+ case HoleKind_X86_64_RELOC_BRANCH :
247+ // 32-bit relative address.
248+ value -= (uint64_t )location ;
249+ // Check that we're not out of range of 32 signed bits:
250+ assert ((int64_t )value >= - (1LL << 31 ));
251+ assert ((int64_t )value < (1LL << 31 ));
252+ loc32 [0 ] = (uint32_t )value ;
253+ continue ;
211254 case HoleKind_R_AARCH64_CALL26 :
212255 case HoleKind_R_AARCH64_JUMP26 :
213256 // 28-bit relative branch.
@@ -249,10 +292,53 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
249292 set_bits (loc32 , 5 , value , 48 , 16 );
250293 continue ;
251294 case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21 :
295+ case HoleKind_R_AARCH64_ADR_GOT_PAGE :
252296 // 21-bit count of pages between this page and an absolute address's
253297 // page... I know, I know, it's weird. Pairs nicely with
254298 // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below).
255299 assert (IS_AARCH64_ADRP (* loc32 ));
300+ // Try to relax the pair of GOT loads into an immediate value:
301+ const Hole * next_hole = & stencil -> holes [i + 1 ];
302+ if (i + 1 < stencil -> holes_size &&
303+ (next_hole -> kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 ||
304+ next_hole -> kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC ) &&
305+ next_hole -> offset == hole -> offset + 4 &&
306+ next_hole -> symbol == hole -> symbol &&
307+ next_hole -> addend == hole -> addend &&
308+ next_hole -> value == hole -> value )
309+ {
310+ unsigned char rd = get_bits (loc32 [0 ], 0 , 5 );
311+ assert (IS_AARCH64_LDR_OR_STR (loc32 [1 ]));
312+ unsigned char rt = get_bits (loc32 [1 ], 0 , 5 );
313+ unsigned char rn = get_bits (loc32 [1 ], 5 , 5 );
314+ assert (rd == rn && rn == rt );
315+ uint64_t relaxed = * (uint64_t * )value ;
316+ if (relaxed < (1UL << 16 )) {
317+ // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop
318+ loc32 [0 ] = 0xD2800000 | (get_bits (relaxed , 0 , 16 ) << 5 ) | rd ;
319+ loc32 [1 ] = 0xD503201F ;
320+ i ++ ;
321+ continue ;
322+ }
323+ if (relaxed < (1ULL << 32 )) {
324+ // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY
325+ loc32 [0 ] = 0xD2800000 | (get_bits (relaxed , 0 , 16 ) << 5 ) | rd ;
326+ loc32 [1 ] = 0xF2A00000 | (get_bits (relaxed , 16 , 16 ) << 5 ) | rd ;
327+ i ++ ;
328+ continue ;
329+ }
330+ relaxed = (uint64_t )value - (uint64_t )location ;
331+ if ((relaxed & 0x3 ) == 0 &&
332+ (int64_t )relaxed >= - (1L << 19 ) &&
333+ (int64_t )relaxed < (1L << 19 ))
334+ {
335+ // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr x0, XXX; nop
336+ loc32 [0 ] = 0x58000000 | (get_bits (relaxed , 2 , 19 ) << 5 ) | rd ;
337+ loc32 [1 ] = 0xD503201F ;
338+ i ++ ;
339+ continue ;
340+ }
341+ }
256342 // Number of pages between this page and the value's page:
257343 value = (value >> 12 ) - ((uint64_t )location >> 12 );
258344 // Check that we're not out of range of 21 signed bits:
@@ -264,6 +350,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
264350 set_bits (loc32 , 5 , value , 2 , 19 );
265351 continue ;
266352 case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 :
353+ case HoleKind_R_AARCH64_LD64_GOT_LO12_NC :
267354 // 12-bit low part of an absolute address. Pairs nicely with
268355 // ARM64_RELOC_GOT_LOAD_PAGE21 (above).
269356 assert (IS_AARCH64_LDR_OR_STR (* loc32 ) || IS_AARCH64_ADD_OR_SUB (* loc32 ));
@@ -285,7 +372,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
285372}
286373
// Compile one stencil into executable memory at base: first copy the
// stencil's raw machine-code/data bytes, then fix up every hole (relocation)
// in the copy using the resolved values in patches.
static void
copy_and_patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
{
    // Raw template bytes first; patch() then rewrites the hole locations
    // in place.
    memcpy(base, stencil->body, stencil->body_size);
    patch(base, stencil, patches);
}
@@ -294,8 +381,8 @@ copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
// Write one stencil group's data and code sections into the buffers whose
// addresses are recorded in patches[HoleValue_DATA]/patches[HoleValue_CODE].
// NOTE(ordering): data is patched *before* code on purpose. While patching
// code, patch() may dereference the relocation target (e.g. to relax a GOT
// load into an immediate), and that target can live in the data section —
// so the data bytes must already be in their final form.
static void
emit(const StencilGroup *group, uint64_t patches[])
{
    copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches);
    copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches);
}
300387
301388// Compiles executor in-place. Don't forget to call _PyJIT_Free later!
@@ -316,14 +403,14 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
316403 assert ((page_size & (page_size - 1 )) == 0 );
317404 code_size += page_size - (code_size & (page_size - 1 ));
318405 data_size += page_size - (data_size & (page_size - 1 ));
319- char * memory = jit_alloc (code_size + data_size );
406+ unsigned char * memory = jit_alloc (code_size + data_size );
320407 if (memory == NULL ) {
321408 return -1 ;
322409 }
323410 // Loop again to emit the code:
324- char * code = memory ;
325- char * data = memory + code_size ;
326- char * top = code ;
411+ unsigned char * code = memory ;
412+ unsigned char * data = memory + code_size ;
413+ unsigned char * top = code ;
327414 if (trace [0 ].opcode == _START_EXECUTOR ) {
328415 // Don't want to execute this more than once:
329416 top += stencil_groups [_START_EXECUTOR ].code .body_size ;
@@ -360,7 +447,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
360447void
361448_PyJIT_Free (_PyExecutorObject * executor )
362449{
363- char * memory = (char * )executor -> jit_code ;
450+ unsigned char * memory = (unsigned char * )executor -> jit_code ;
364451 size_t size = executor -> jit_size ;
365452 if (memory ) {
366453 executor -> jit_code = NULL ;