Skip to content

Conversation

@bend-n
Copy link
Contributor

@bend-n bend-n commented Nov 20, 2025

Codegen for the following example (sourced from #98326 (comment)):

/// Sums the elements of `arr` using `STEP_SIZE` (16) separate running totals.
///
/// The slice is consumed in chunks of `STEP_SIZE` elements; element `i` of
/// every chunk is added to running total `i`, and the totals are added
/// together at the end.
///
/// Only complete chunks contribute: the loop stops at the first `Err` from
/// `next_chunk`, so a trailing group of fewer than `STEP_SIZE` elements is
/// not included in the returned value.
///
/// NOTE(review): `Iterator::next_chunk` is presumably still an unstable API
/// (`iter_next_chunk`) — confirm the toolchain before building this example.
pub fn simd_sum_slow(arr: &[u32]) -> u32 {
    // Chunk length, and therefore also the number of running totals.
    const STEP_SIZE: usize = 16;

    // One running total per position within a chunk.
    let mut result = [0; STEP_SIZE];

    let mut iter = arr.iter();

    // `Ok` carries a full array of `STEP_SIZE` element references; any other
    // outcome ends the loop.
    while let Ok(c) = iter.next_chunk::<STEP_SIZE>() {
        // Pair each chunk element with the running total at the same index.
        for (&n, r) in c.iter().zip(result.iter_mut()) {
            *r += n;
        }
    }

    // Collapse the per-position totals into the final sum.
    result.iter().sum()
}

The generated assembly goes from this (znver4):

many asm
# simd_sum_slow, codegen before the change: fully scalar, with one
# end-of-slice comparison per element of every 16-element chunk.
# rdi holds the slice data pointer and rsi the element count
# (the end pointer is formed below as rdi + 4*rsi).
simd_sum_slow:
	.cfi_startproc
	# Prologue: save rbp, r15, r14, r13, r12, rbx and reserve 240 bytes.
	push rbp
	.cfi_def_cfa_offset 16
	push r15
	.cfi_def_cfa_offset 24
	push r14
	.cfi_def_cfa_offset 32
	push r13
	.cfi_def_cfa_offset 40
	push r12
	.cfi_def_cfa_offset 48
	push rbx
	.cfi_def_cfa_offset 56
	sub rsp, 240
	.cfi_def_cfa_offset 296
	.cfi_offset rbx, -56
	.cfi_offset r12, -48
	.cfi_offset r13, -40
	.cfi_offset r14, -32
	.cfi_offset r15, -24
	.cfi_offset rbp, -16
	# r12 = one-past-the-end pointer of the slice.
	lea r12, [rdi + 4*rsi]
	# Zero the 16 running totals, each kept in its own stack slot.
	mov qword ptr [rsp - 32], 0
	mov dword ptr [rsp - 88], 0
	mov dword ptr [rsp - 100], 0
	mov dword ptr [rsp - 72], 0
	mov dword ptr [rsp - 96], 0
	mov dword ptr [rsp - 92], 0
	mov dword ptr [rsp - 52], 0
	mov dword ptr [rsp - 84], 0
	mov dword ptr [rsp - 80], 0
	mov dword ptr [rsp - 76], 0
	mov dword ptr [rsp - 44], 0
	mov dword ptr [rsp - 68], 0
	mov dword ptr [rsp - 64], 0
	mov dword ptr [rsp - 60], 0
	mov dword ptr [rsp - 56], 0
	mov dword ptr [rsp - 48], 0
	# Keep the end pointer at [rsp + 224]; an empty slice goes straight
	# to the "0 elements" exit path.
	mov qword ptr [rsp + 224], r12
	cmp rdi, r12
	mov qword ptr [rsp + 96], rdi
	je .LBB12_2
	.p2align	4
	# Top of the chunk loop. The address of each following element
	# (rdi + 4, rdi + 8, ... rdi + 60) is computed and compared with the
	# end pointer in turn; the first match branches to the .LBB12_<n>
	# block that records how many elements were available.
.LBB12_3:
	lea r13, [rdi + 4]
	cmp r13, r12
	je .LBB12_4
	lea r15, [rdi + 8]
	cmp r15, r12
	je .LBB12_6
	lea r14, [rdi + 12]
	cmp r14, r12
	je .LBB12_8
	lea rbx, [rdi + 16]
	cmp rbx, r12
	je .LBB12_10
	lea r11, [rdi + 20]
	cmp r11, r12
	je .LBB12_12
	lea r10, [rdi + 24]
	cmp r10, r12
	je .LBB12_14
	lea r9, [rdi + 28]
	cmp r9, r12
	je .LBB12_16
	lea r8, [rdi + 32]
	cmp r8, r12
	je .LBB12_18
	lea rax, [rdi + 36]
	cmp rax, r12
	je .LBB12_20
	mov qword ptr [rsp - 120], rax
	lea rax, [rdi + 40]
	mov qword ptr [rsp - 112], rax
	cmp rax, r12
	je .LBB12_22
	lea rdx, [rdi + 44]
	cmp rdx, r12
	je .LBB12_24
	lea rbp, [rdi + 48]
	cmp rbp, r12
	je .LBB12_26
	mov qword ptr [rsp - 40], r9
	lea rcx, [rdi + 52]
	cmp rcx, r12
	je .LBB12_30
	lea r9, [rdi + 56]
	cmp r9, r12
	je .LBB12_32
	lea rax, [rdi + 60]
	cmp rax, r12
	mov qword ptr [rsp - 24], r9
	je .LBB12_34
	# All 16 elements are in bounds. Remember the start of the next chunk
	# (rdi + 64) at [rsp - 8], clear the flag at [rsp - 128], and shuffle
	# the element pointers into the slots read by .LBB12_39.
	mov qword ptr [rsp + 88], rax
	lea rax, [rdi + 64]
	mov qword ptr [rsp - 8], rax
	mov dword ptr [rsp - 128], 0
	mov qword ptr [rsp - 16], rdi
	mov qword ptr [rsp + 56], r10
	lea r10, [rsp + 216]
	mov qword ptr [rsp + 40], r10
	mov qword ptr [rsp + 96], r13
	lea rax, [rsp + 208]
	mov qword ptr [rsp + 32], rax
	mov qword ptr [rsp + 72], rdx
	lea rsi, [rsp + 200]
	mov qword ptr [rsp + 48], rbx
	lea rbx, [rsp + 192]
	mov qword ptr [rsp + 80], rcx
	lea rcx, [rsp + 184]
	mov qword ptr [rsp + 64], rbp
	lea rdx, [rsp + 176]
	mov qword ptr [rsp + 16], r14
	lea r14, [rsp + 168]
	mov qword ptr [rsp + 24], r11
	lea r11, [rsp + 160]
	lea r9, [rsp + 152]
	lea r12, [rsp + 144]
	lea r13, [rsp + 136]
	lea rbp, [rsp + 128]
	mov qword ptr [rsp], r15
	lea r15, [rsp + 120]
	mov qword ptr [rsp + 8], r8
	lea r8, [rsp + 112]
	lea rdi, [rsp + 104]
	jmp .LBB12_39
	.p2align	4
	# Incomplete-chunk entry points. Each one stores the number of
	# elements that were reached before the end pointer (0 through 15)
	# at [rsp - 128] and falls into the shared tail at .LBB12_36/.LBB12_37.
.LBB12_2:
	mov qword ptr [rsp - 128], 0
	jmp .LBB12_37
	.p2align	4
.LBB12_4:
	mov eax, 1
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_37
	.p2align	4
.LBB12_6:
	mov eax, 2
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_36
.LBB12_8:
	mov eax, 3
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_36
.LBB12_10:
	mov eax, 4
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_36
.LBB12_12:
	mov eax, 5
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_36
.LBB12_14:
	mov eax, 6
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_36
.LBB12_16:
	mov eax, 7
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_36
.LBB12_18:
	mov eax, 8
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_36
.LBB12_20:
	mov eax, 9
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_36
.LBB12_22:
	mov eax, 10
	mov qword ptr [rsp - 128], rax
	mov rdi, qword ptr [rsp - 120]
	jmp .LBB12_36
.LBB12_24:
	mov eax, 11
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_27
.LBB12_26:
	mov eax, 12
	mov qword ptr [rsp - 128], rax
.LBB12_27:
	mov rsi, qword ptr [rsp - 112]
	mov rdi, qword ptr [rsp - 120]
	jmp .LBB12_36
.LBB12_30:
	mov rcx, rbp
	mov eax, 13
	mov qword ptr [rsp - 128], rax
	jmp .LBB12_35
.LBB12_32:
	mov rax, rcx
	mov rcx, rbp
	mov esi, 14
	mov qword ptr [rsp - 128], rsi
	jmp .LBB12_35
.LBB12_34:
	mov rax, rcx
	mov rcx, rbp
	mov esi, 15
	mov qword ptr [rsp - 128], rsi
.LBB12_35:
	mov rsi, qword ptr [rsp - 112]
	mov rdi, qword ptr [rsp - 120]
	mov r9, qword ptr [rsp - 40]
	.p2align	4
.LBB12_36:
	mov rbp, r13
	# Shared incomplete-chunk tail: copy the recorded count to [rsp + 216],
	# then overwrite [rsp - 128] with 1 so the flag test after .LBB12_39
	# leaves the loop. The remaining stores and leas rearrange the
	# partially filled pointer set into the layout .LBB12_39 expects.
.LBB12_37:
	mov r13, qword ptr [rsp - 24]
	mov qword ptr [rsp + 88], r13
	mov r13, qword ptr [rsp - 128]
	mov qword ptr [rsp + 216], r13
	mov r13b, 1
	mov dword ptr [rsp - 128], r13d
	mov qword ptr [rsp - 24], rax
	mov qword ptr [rsp + 80], rcx
	mov qword ptr [rsp + 64], rdx
	mov qword ptr [rsp + 72], rsi
	mov qword ptr [rsp - 112], rdi
	mov qword ptr [rsp - 120], r8
	mov qword ptr [rsp + 8], r9
	mov qword ptr [rsp - 40], r10
	mov qword ptr [rsp + 56], r11
	mov qword ptr [rsp + 24], rbx
	mov qword ptr [rsp + 48], r14
	mov qword ptr [rsp + 16], r15
	mov qword ptr [rsp], rbp
	mov eax, 0
	mov qword ptr [rsp - 16], rax
	mov qword ptr [rsp - 8], r12
	lea r10, [rsp + 208]
	mov qword ptr [rsp + 40], r10
	lea rax, [rsp + 200]
	mov qword ptr [rsp + 32], rax
	lea rsi, [rsp + 192]
	lea rbx, [rsp + 184]
	lea rcx, [rsp + 176]
	lea rdx, [rsp + 168]
	lea r14, [rsp + 160]
	lea r11, [rsp + 152]
	lea r9, [rsp + 144]
	lea r12, [rsp + 136]
	lea r13, [rsp + 128]
	lea rbp, [rsp + 120]
	lea r15, [rsp + 112]
	lea r8, [rsp + 104]
	lea rdi, [rsp + 232]
	# Common join point: write the 15 saved pointer values through the
	# destination addresses prepared by whichever path got here.
.LBB12_39:
	mov r10, qword ptr [rsp + 96]
	mov rax, qword ptr [rsp + 40]
	mov qword ptr [rax], r10
	mov rax, qword ptr [rsp + 32]
	mov r10, qword ptr [rsp]
	mov qword ptr [rax], r10
	mov rax, qword ptr [rsp + 16]
	mov qword ptr [rsi], rax
	mov rax, qword ptr [rsp + 48]
	mov qword ptr [rbx], rax
	mov rax, qword ptr [rsp + 24]
	mov qword ptr [rcx], rax
	mov rax, qword ptr [rsp + 56]
	mov qword ptr [rdx], rax
	mov rax, qword ptr [rsp - 40]
	mov qword ptr [r14], rax
	mov rax, qword ptr [rsp + 8]
	mov qword ptr [r11], rax
	mov rax, qword ptr [rsp - 120]
	mov qword ptr [r9], rax
	mov rax, qword ptr [rsp - 112]
	mov qword ptr [r12], rax
	mov rax, qword ptr [rsp + 72]
	mov qword ptr [r13], rax
	mov rax, qword ptr [rsp + 64]
	mov qword ptr [rbp], rax
	mov rax, qword ptr [rsp + 80]
	mov qword ptr [r15], rax
	mov rax, qword ptr [rsp - 24]
	mov qword ptr [r8], rax
	mov rax, qword ptr [rsp + 88]
	mov qword ptr [rdi], rax
	# A non-zero flag means the chunk was incomplete: leave the loop.
	cmp byte ptr [rsp - 128], 0
	jne .LBB12_40
	# Complete chunk: load each of the 16 element pointers, read the
	# element, and add it to the matching running total on the stack.
	mov rax, qword ptr [rsp - 32]
	mov rcx, qword ptr [rsp - 16]
	add eax, dword ptr [rcx]
	mov qword ptr [rsp - 32], rax
	mov rax, qword ptr [rsp + 216]
	mov ecx, dword ptr [rsp - 88]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 88], ecx
	mov rax, qword ptr [rsp + 208]
	mov ecx, dword ptr [rsp - 100]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 100], ecx
	mov rax, qword ptr [rsp + 200]
	mov ecx, dword ptr [rsp - 72]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 72], ecx
	mov rax, qword ptr [rsp + 192]
	mov ecx, dword ptr [rsp - 96]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 96], ecx
	mov rax, qword ptr [rsp + 184]
	mov ecx, dword ptr [rsp - 92]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 92], ecx
	mov rax, qword ptr [rsp + 176]
	mov ecx, dword ptr [rsp - 52]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 52], ecx
	mov rax, qword ptr [rsp + 168]
	mov ecx, dword ptr [rsp - 84]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 84], ecx
	mov rax, qword ptr [rsp + 160]
	mov ecx, dword ptr [rsp - 80]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 80], ecx
	mov rax, qword ptr [rsp + 152]
	mov ecx, dword ptr [rsp - 76]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 76], ecx
	mov rax, qword ptr [rsp + 144]
	mov ecx, dword ptr [rsp - 44]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 44], ecx
	mov rax, qword ptr [rsp + 136]
	mov ecx, dword ptr [rsp - 68]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 68], ecx
	mov rax, qword ptr [rsp + 128]
	mov ecx, dword ptr [rsp - 64]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 64], ecx
	mov rax, qword ptr [rsp + 120]
	mov ecx, dword ptr [rsp - 60]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 60], ecx
	mov rax, qword ptr [rsp + 112]
	mov ecx, dword ptr [rsp - 56]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 56], ecx
	mov rax, qword ptr [rsp + 104]
	mov ecx, dword ptr [rsp - 48]
	add ecx, dword ptr [rax]
	mov dword ptr [rsp - 48], ecx
	# Move to the next chunk start saved at [rsp - 8], reload the end
	# pointer, and either start another chunk or take the empty path.
	mov rdi, qword ptr [rsp - 8]
	mov r12, qword ptr [rsp + 224]
	cmp rdi, r12
	mov qword ptr [rsp + 96], rdi
	jne .LBB12_3
	jmp .LBB12_2
	# Loop exit: add the 16 running totals together into eax, then undo
	# the prologue and return.
.LBB12_40:
	mov eax, dword ptr [rsp - 88]
	add eax, dword ptr [rsp - 32]
	mov ecx, dword ptr [rsp - 72]
	add ecx, dword ptr [rsp - 100]
	add ecx, eax
	mov edx, dword ptr [rsp - 92]
	add edx, dword ptr [rsp - 96]
	mov eax, dword ptr [rsp - 52]
	add eax, edx
	add eax, ecx
	mov ecx, dword ptr [rsp - 80]
	add ecx, dword ptr [rsp - 84]
	mov edx, dword ptr [rsp - 76]
	add edx, ecx
	mov ecx, dword ptr [rsp - 44]
	add ecx, edx
	add ecx, eax
	mov edx, dword ptr [rsp - 64]
	add edx, dword ptr [rsp - 68]
	mov eax, dword ptr [rsp - 60]
	add eax, edx
	mov edx, dword ptr [rsp - 56]
	add edx, eax
	mov eax, dword ptr [rsp - 48]
	add eax, edx
	add eax, ecx
	add rsp, 240
	.cfi_def_cfa_offset 56
	pop rbx
	.cfi_def_cfa_offset 48
	pop r12
	.cfi_def_cfa_offset 40
	pop r13
	.cfi_def_cfa_offset 32
	pop r14
	.cfi_def_cfa_offset 24
	pop r15
	.cfi_def_cfa_offset 16
	pop rbp
	.cfi_def_cfa_offset 8
	ret

to

# simd_sum_slow, codegen after the change: a single length check per
# chunk and four 128-bit vector accumulators (4 x u32 each, 16 totals).
# rdi holds the slice data pointer and rsi the element count.
simd_sum_slow:
	.cfi_startproc
	# Result defaults to 0; fewer than 16 elements means no complete
	# chunk, so return immediately.
	xor eax, eax
	cmp rsi, 16
	jb .LBB12_4
	# Convert the element count into a byte count (4 bytes per u32).
	shl rsi, 2
	# Zero the four vector accumulators.
	pxor xmm0, xmm0
	pxor xmm1, xmm1
	pxor xmm3, xmm3
	pxor xmm2, xmm2
	.p2align	4
	# Chunk loop: four unaligned 16-byte loads cover one 64-byte chunk,
	# each added lane-wise into its own accumulator.
.LBB12_2:
	movdqu xmm4, xmmword ptr [rdi]
	paddd xmm0, xmm4
	movdqu xmm4, xmmword ptr [rdi + 16]
	paddd xmm1, xmm4
	movdqu xmm4, xmmword ptr [rdi + 32]
	paddd xmm3, xmm4
	movdqu xmm4, xmmword ptr [rdi + 48]
	paddd xmm2, xmm4
	# Advance by one chunk; keep looping while more than 60 bytes remain
	# (the remaining byte count is a multiple of 4, so that means >= 64).
	add rdi, 64
	add rsi, -64
	cmp rsi, 60
	ja .LBB12_2
	# Combine the four accumulators into xmm1.
	paddd xmm0, xmm3
	paddd xmm1, xmm2
	paddd xmm1, xmm0
	# Horizontal add: fold the upper two lanes onto the lower two
	# (shuffle 238), then lane 1 onto lane 0 (shuffle 85).
	pshufd xmm0, xmm1, 238
	paddd xmm0, xmm1
	pshufd xmm1, xmm0, 85
	paddd xmm1, xmm0
	# Lane 0 now holds the total; return it in eax.
	movd eax, xmm1
.LBB12_4:
	ret

@rustbot rustbot added S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. T-libs Relevant to the library team, which will review and decide on the PR/issue. labels Nov 20, 2025
@rustbot
Copy link
Collaborator

rustbot commented Nov 20, 2025

r? @Mark-Simulacrum

rustbot has assigned @Mark-Simulacrum.
They will have a look at your PR within the next two weeks and either review your PR or reassign to another reviewer.

Use r? to explicitly pick a reviewer

@bend-n bend-n force-pushed the optimize_slice_iter_next_chunk branch from 305559b to 5df6fad Compare November 20, 2025 10:35
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. T-libs Relevant to the library team, which will review and decide on the PR/issue.

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants