From c0e39927334ada6619b4b7995744853b763f757e Mon Sep 17 00:00:00 2001 From: Vincent Palmer Date: Wed, 22 Apr 2026 14:37:23 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20revert=20rayon=20from=20weight=20convers?= =?UTF-8?q?ion=20=E2=80=94=2063%=20slower=20due=20to=20memory=20contention?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rayon par_iter on 35 layers caused memory bandwidth saturation on a 32GB system with 10GB mmap + 4GB student model. Sequential conversion is faster because a single thread has exclusive bandwidth access. Conversion: 770s (rayon) → expected ~470s (sequential, matching PR #60) Prefill: 123s (rayon build) → expected ~42s (matching PR #60) Keep rayon for ternary_matmul_parallel() only (small matrices, no contention). --- src/model/cpu_block_attn_res.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/model/cpu_block_attn_res.rs b/src/model/cpu_block_attn_res.rs index eda7609..f4c3fd3 100644 --- a/src/model/cpu_block_attn_res.rs +++ b/src/model/cpu_block_attn_res.rs @@ -1,6 +1,5 @@ use crate::model::cpu_linear::{CpuLinear, CpuRmsNorm}; use crate::model::cpu_moe::CpuMoELayer; -use rayon::prelude::*; use crate::model::gemma_mapper::{matmul, rms_norm, apply_rope, apply_rope_gqa, gelu_tanh}; use crate::model::gemma_mapper::{MappedGemma4Model, Gemma4FfnWeights}; @@ -2148,7 +2147,7 @@ pub fn gemma4_to_block_attnres(teacher: &MappedGemma4Model) -> CpuBlockAttnResMo let vs = config.vocab_size; let first_shared_layer = config.num_layers.saturating_sub(config.num_kv_shared_layers); - let layers: Vec = teacher.layers.par_iter().enumerate().map(|(layer_idx, layer_weights)| { + let layers: Vec = teacher.layers.iter().enumerate().map(|(layer_idx, layer_weights)| { let layer_head_dim = layer_weights.attn.head_dim; let layer_q_dim = layer_weights.attn.q_dim; let layer_kv_dim = layer_weights.attn.kv_dim;