From 14539420801b62cae53b46cbae29eaa7c02286f8 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Sun, 22 Jun 2025 22:33:59 +0530 Subject: [PATCH 01/22] Announcement update for Granite Vision (#474) Announcement update for Granite Vision in 1.20.0 release --------- Signed-off-by: Dipankar Sarkar --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 38fa8c595..9149864df 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ *Latest news* :fire:
- [06/2025] Added support for Llama4 Multi-Model [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) - [06/2025] Added support of model `hpcai-tech/grok-1` [hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1) +- [04/2025] Added support of model `ibm-granite/granite-vision-3.2-2b`[ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b) - [03/2025] Added support for swiftkv model [Snowflake/Llama-3.1-SwiftKV-8B-Instruct](https://huggingface.co/Snowflake/Llama-3.1-SwiftKV-8B-Instruct) - [02/2025] [VLMs support](https://github.com/quic/efficient-transformers/pull/267) added for the models [InternVL-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B), [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [Mllama](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) - [01/2025] [FP8 models support](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127) Added support for inference of FP8 models. From 740f7c23409c17f4d576c241d9c43f30221dbabf Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Mon, 23 Jun 2025 14:24:27 +0530 Subject: [PATCH 02/22] Fixes for mllama (#462) Addressed The issue faced during mllama dual qpc compilation Added coditions for mllama in modeling_auto --------- Signed-off-by: Dipankar Sarkar --- QEfficient/transformers/models/mllama/modeling_mllama.py | 8 +++++--- QEfficient/transformers/models/modeling_auto.py | 7 ++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 8d7d4b779..1cfafae58 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -841,6 +841,7 @@ def get_qeff_language_decoder(self): def forward( self, input_ids: Optional[torch.LongTensor] = None, + image_idx: Optional[torch.LongTensor] = None, pixel_values: Optional[torch.FloatTensor] = None, aspect_ratio_mask: Optional[torch.Tensor] = None, aspect_ratio_ids: Optional[torch.Tensor] = None, @@ -924,8 +925,8 @@ def forward( return_dict=return_dict, cache_position=cache_position, ) - outputs["pixel_values"] = pixel_values - return outputs + + return outputs.logits, image_idx, outputs.past_key_values, pixel_values def get_dummy_inputs(self, kv_offload: bool = False): BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE @@ -961,6 +962,7 @@ def get_dummy_inputs(self, kv_offload: bool = False): # lang_inputs lang_inputs = { "input_ids": torch.zeros((BS, SEQ_LEN), dtype=torch.int64), + "image_idx": torch.zeros((1, 1), dtype=torch.int64), "cross_attention_mask": torch.zeros((BS, SEQ_LEN, MAX_NUM_IMG, max_num_img_tiles), dtype=torch.int64), "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64), } @@ -1087,7 +1089,6 @@ def get_output_names(self, kv_offload: bool = False): for i in self.config.text_config.cross_attention_layers: vision_output_names.append(f"past_key.{i}") vision_output_names.append(f"past_value.{i}") - lang_output_names = [ "logits", *[f"past_{kv}.{i}_RetainedState" for i in range(num_hidden_layers) for kv in ["key", "value"]], @@ -1096,6 +1097,7 @@ def get_output_names(self, kv_offload: bool = False): lang_output_names.append("pixel_values_RetainedState") output_names = {} + lang_output_names.insert(1, "image_idx_output") if kv_offload: output_names["vision"] = vision_output_names output_names["lang"] = lang_output_names diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 3e85a8e24..6bff10f5a 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -486,7 +486,6 @@ def model_hash(self) -> str: mhash.update(to_hashable(self.model.model.config.to_diff_dict())) mhash.update(to_hashable(self._transform_names())) mhash.update(to_hashable({"QEffVisionEncoderForTextImageToTextModel": True})) - if hasattr(self.model, "model"): mhash.update(to_hashable(self.model.model.pretrained_model_name_or_path)) else: @@ -555,7 +554,10 @@ def model_hash(self) -> str: mhash.update(to_hashable(self.model.config.to_diff_dict())) mhash.update(to_hashable(self._transform_names())) mhash.update(to_hashable({"QEffCausalLMForTextImageToTextModel": True})) - mhash.update(to_hashable(self.model.model.pretrained_model_name_or_path)) + if hasattr(self.model, "model"): + mhash.update(to_hashable(self.model.model.pretrained_model_name_or_path)) + else: + mhash.update(to_hashable(self.model.pretrained_model_name_or_path)) mhash = mhash.hexdigest()[:16] return mhash @@ -2119,7 +2121,6 @@ def generate( raise TypeError("Please run compile API first!") inputs = self.auto_correct_inputs(inputs) - if self.qpc_session is None: self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) self.batch_size = self.qpc_session.bindings[0].dims[0] From 61b14459da6e5dd4b2079e78fd869f6b979876a1 Mon Sep 17 00:00:00 2001 From: Shagun Sood <168412978+quic-shagun@users.noreply.github.com> Date: Tue, 24 Jun 2025 21:10:04 -0700 Subject: [PATCH 03/22] BugFix: Fix reshape error for llama swiftkv models (#432) This fixed the issue with higher BS compilation for SwiftKV models ``` Compiler command: ['/opt/qti-aic/exec/qaic-exec', '-aic-hw', '-aic-hw-version=2.0', '-m=/prj/qct/aisyssol_scratch/users/shagsood/quic_shagun/LlamaSwiftKVForCausalLM-a5879ebc0e59ab40/LlamaSwiftKVForCausal LM.onnx', '-compile-only', '-retained-state', '-convert-to-fp16', '-aic-num-cores=16', '-network-specialization-config=/prj/qct/aisyssol_scratch/users/shagsood/quic_shagun/LlamaSwiftKVForCausalLM-a5879eb c0e59ab40/qpc-60f86f912a187346/specializations.json', '-custom-IO-list-file=/prj/qct/aisyssol_scratch/users/shagsood/quic_shagun/LlamaSwiftKVForCausalLM-a5879ebc0e59ab40/qpc-60f86f912a187346/custom_io.ya ml', '-mdp-load-partition-config=/prj/qct/aisyssol_scratch/users/shagsood/quic_shagun/LlamaSwiftKVForCausalLM-a5879ebc0e59ab40/qpc-60f86f912a187346/mdp_ts_4.json', '-aic-binary-dir=/prj/qct/aisyssol_scra tch/users/shagsood/quic_shagun/LlamaSwiftKVForCausalLM-a5879ebc0e59ab40/qpc-60f86f912a187346/qpc'] Compiler exitcode: 1 Compiler stderr: QAIC_ERROR: Error message: [Operator-'/model/layers.16/self_attn/Reshape'] : Reshape: input shape (4, 4, 4096) and output shape (4, 1, 32, 128) have different number of elements (in 65536 vs. out 16384) Unable to AddNodesToGraphFromModel ``` Tested with BS4. Able to compile now Signed-off-by: quic-shagun --- .../models/llama_swiftkv/modeling_llama_swiftkv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py index f6cf2de49..7b96aefcc 100644 --- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py +++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py @@ -371,8 +371,8 @@ def forward( hidden_states = orig_hidden_states[torch.arange(orig_hidden_states.shape[0]).reshape(-1, 1), last_pos_id, :] causal_mask = causal_mask[torch.arange(orig_hidden_states.shape[0]).reshape(-1, 1), :, last_pos_id, :] else: - hidden_states = orig_hidden_states[torch.arange(bsz), last_pos_id, :] - causal_mask = causal_mask[torch.arange(bsz), :, last_pos_id, :] + hidden_states = orig_hidden_states[torch.arange(bsz).reshape(-1, 1), last_pos_id, :] + causal_mask = causal_mask[torch.arange(bsz).reshape(-1, 1), :, last_pos_id, :] hidden_states, next_decoder_cache = self._run_swiftkv_layers( hidden_states, position_ids, past_key_values, causal_mask, batch_index From eff9472034784c85ca707e5d17aa98b9c5a7e23c Mon Sep 17 00:00:00 2001 From: quic-akuruvil Date: Wed, 25 Jun 2025 19:13:51 +0530 Subject: [PATCH 04/22] Gemma 3 minor fixes (#476) CI enablement and other minor fixes for Gemma3 --------- Signed-off-by: Ann Kuruvilla --- QEfficient/transformers/cache_utils.py | 2 - .../models/gemma3/modeling_gemma3.py | 5 +-- README.md | 1 + docs/source/validate.md | 2 + examples/gemma3_example/fp32_mm.yaml | 4 +- examples/gemma3_example/gemma3_mm.py | 23 +++++----- .../models/test_image_text_to_text_models.py | 45 +++++++++---------- 7 files changed, 39 insertions(+), 43 deletions(-) diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py index 7162b856a..16767fbe2 100644 --- a/QEfficient/transformers/cache_utils.py +++ b/QEfficient/transformers/cache_utils.py @@ -288,7 +288,6 @@ def from_legacy_cache( class QEffHybridCache(HybridCache): def __init__(self, config, batch_size, max_cache_len): super().__init__(config, batch_size, max_cache_len=max_cache_len) - # breakpoint() self.key_cache: List[torch.Tensor] = [] self.value_cache: List[torch.Tensor] = [] @@ -327,7 +326,6 @@ def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format. Used for backward compatibility.""" legacy_cache = () - # breakpoint() for layer_idx in range(len(self)): legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),) return legacy_cache diff --git a/QEfficient/transformers/models/gemma3/modeling_gemma3.py b/QEfficient/transformers/models/gemma3/modeling_gemma3.py index bda5959a7..9e9544b7e 100644 --- a/QEfficient/transformers/models/gemma3/modeling_gemma3.py +++ b/QEfficient/transformers/models/gemma3/modeling_gemma3.py @@ -238,9 +238,9 @@ def forward( ) kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx) if self.is_sliding: - cos, sin = self.rotary_emb_local(value_states, seq_len=constants.GEMMA3_MAX_POSITION_EMBEDDINGS) + cos, sin = self.rotary_emb_local(value_states, seq_len=self.config.max_position_embeddings) else: - cos, sin = self.rotary_emb(value_states, seq_len=constants.GEMMA3_MAX_POSITION_EMBEDDINGS) + cos, sin = self.rotary_emb(value_states, seq_len=self.config.max_position_embeddings) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -687,7 +687,6 @@ def get_specializations( "mm_tokens_per_image": mm_tokens_per_image, }, ] - specializations = {} if kv_offload: diff --git a/README.md b/README.md index 9149864df..2edb65797 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ *Latest news* :fire:
- [06/2025] Added support for Llama4 Multi-Model [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) +- [06/2025] Added support for Gemma3 Multi-Modal-Model [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it) - [06/2025] Added support of model `hpcai-tech/grok-1` [hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1) - [04/2025] Added support of model `ibm-granite/granite-vision-3.2-2b`[ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b) - [03/2025] Added support for swiftkv model [Snowflake/Llama-3.1-SwiftKV-8B-Instruct](https://huggingface.co/Snowflake/Llama-3.1-SwiftKV-8B-Instruct) diff --git a/docs/source/validate.md b/docs/source/validate.md index b12db2287..c10d68daf 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -63,6 +63,8 @@ | **MllamaForConditionalGeneration** | Llama 3.2 | [meta-llama/Llama-3.2-11B-Vision Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)
[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | |**LlavaNextForConditionalGeneration** | Granite Vision | [ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b) |**Llama4ForConditionalGeneration** | Llama-4-Scout | [Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) +|**Gemma3ForConditionalGeneration** | Gemma3 | [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it) + ### Audio Models (Automatic Speech Recognition) - Transcription Task **QEff Auto Class:** `QEFFAutoModelForSpeechSeq2Seq` diff --git a/examples/gemma3_example/fp32_mm.yaml b/examples/gemma3_example/fp32_mm.yaml index 3414f2a54..28e7485fa 100755 --- a/examples/gemma3_example/fp32_mm.yaml +++ b/examples/gemma3_example/fp32_mm.yaml @@ -370,7 +370,7 @@ FP32NodeInstanceNames: - /language_model/model/layers.4/self_attn/Mul_6_output_0 - /language_model/model/layers.4/self_attn/Mul_7_output_0 - /language_model/model/layers.4/self_attn/Mul_8_output_0 - - /language_model/model/layers.4/self_attn/Mul_9_output_0 [274/1312] + - /language_model/model/layers.4/self_attn/Mul_9_output_0 - /language_model/model/layers.5/self_attn/Mul_output_0 - /language_model/model/layers.5/self_attn/Mul_1_output_0 - /language_model/model/layers.5/self_attn/Mul_2_output_0 @@ -415,7 +415,7 @@ FP32NodeInstanceNames: - /language_model/model/layers.9/self_attn/Mul_1_output_0 - /language_model/model/layers.9/self_attn/Mul_2_output_0 - /language_model/model/layers.9/self_attn/Mul_3_output_0 - - /language_model/model/layers.9/self_attn/Mul_4_output_0 [229/1312] + - /language_model/model/layers.9/self_attn/Mul_4_output_0 - /language_model/model/layers.9/self_attn/Mul_5_output_0 - /language_model/model/layers.9/self_attn/Mul_6_output_0 - /language_model/model/layers.9/self_attn/Mul_7_output_0 diff --git a/examples/gemma3_example/gemma3_mm.py b/examples/gemma3_example/gemma3_mm.py index 717049d13..f48d2d307 100644 --- a/examples/gemma3_example/gemma3_mm.py +++ b/examples/gemma3_example/gemma3_mm.py @@ -7,7 +7,7 @@ import torch import transformers -from transformers import AutoConfig, AutoModelForImageTextToText, AutoProcessor, TextStreamer +from transformers import AutoConfig, AutoProcessor from QEfficient import QEFFAutoModelForImageTextToText @@ -16,12 +16,14 @@ # For Testing Purpose Only config.text_config.num_hidden_layers = 1 config.vision_config.num_hidden_layers = 2 - -model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager", config=config) -model.eval() tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) processor = AutoProcessor.from_pretrained(model_id) -qeff_model = QEFFAutoModelForImageTextToText(model, kv_offload=True) + +# pass HF_TOKEN if gated model +# For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ### +qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_id, config=config, attn_implementation="eager", kv_offload=True +) ### use skip_vision=Ture, if want to run only text, or false ### skip_vision = True @@ -59,9 +61,7 @@ return_tensors="pt", ) - streamer = TextStreamer(tokenizer) - output = qeff_model.generate(inputs=inputs, device_ids=[0], generation_len=100) - print(output.generated_ids) + output = qeff_model.generate(inputs=inputs, generation_len=100) print(tokenizer.batch_decode(output.generated_ids)) print(output) @@ -72,7 +72,7 @@ ctx_len=3072, img_size=896, num_cores=16, - num_devices=8, + num_devices=1, mxfp6_matmul=False, mxint8_kv_cache=False, aic_enable_depth_first=True, @@ -103,9 +103,6 @@ return_tensors="pt", ) inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - streamer = TextStreamer(tokenizer) - output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3], generation_len=100) - print(output.generated_ids) + output = qeff_model.generate(inputs=inputs, generation_len=100) print(tokenizer.batch_decode(output.generated_ids)) print(output) - print() diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index c31491442..54f167281 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -88,29 +88,28 @@ "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", 4, ), - # FIX: Accuracy in AIC - # ( - # "google/gemma-3-4b-it", - # True, - # 1, - # 128, - # 3072, - # 896, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - # "Can you describe the image in detail.", - # 6, - # ), - # ( - # "google/gemma-3-4b-it", - # False, - # 1, - # 128, - # 3072, - # 896, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - # "Can you describe the image in detail.", - # 6, - # ), + ( + "google/gemma-3-4b-it", + True, + 1, + 128, + 3072, + 896, + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "Can you describe the image in detail.", + 1, + ), + ( + "google/gemma-3-4b-it", + False, + 1, + 128, + 3072, + 896, + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "Can you describe the image in detail.", + 1, + ), # ( # "meta-llama/Llama-3.2-11B-Vision-Instruct", # True, From 77cfb29b27f42b3d3ea731be24f43b952245e3d4 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Fri, 27 Jun 2025 15:31:35 +0530 Subject: [PATCH 05/22] Bug fix for spdTransform (#467) Added fix for spdtransform due to change in hash --------- Signed-off-by: Dipankar Sarkar --- QEfficient/transformers/models/modeling_auto.py | 3 +-- QEfficient/transformers/models/pytorch_transforms.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 6bff10f5a..2f3ee3dc0 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1415,10 +1415,9 @@ def __init__( self.num_layers = model.config.num_hidden_layers self.continuous_batching = continuous_batching self.model.qaic_config = qaic_config - + self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None) self.model, transformed = SpDTransform.apply(self.model, qaic_config, **kwargs) self.is_tlm = transformed - self.pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path", None) # ---Sampling--- # Note: SamplerTransform should be applied after all other transforms diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 42807753d..ca74c0ddd 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -503,6 +503,7 @@ class SpDTransform: @classmethod def apply(cls, model: nn.Module, qaic_config: Optional[dict] = None, **kwargs) -> Tuple[nn.Module, bool]: transformed = False + pretrained_model_name_or_path_temp = kwargs.pop("pretrained_model_name_or_path", None) if qaic_config is None or (speculative_model_type := qaic_config.get("speculative_model_type")) is None: return model, transformed elif speculative_model_type not in ( @@ -524,6 +525,7 @@ def apply(cls, model: nn.Module, qaic_config: Optional[dict] = None, **kwargs) - raise NotImplementedError( f"model class {model_class} does not yet support returning multiple logits to keep." ) + kwargs["pretrained_model_name_or_path"] = pretrained_model_name_or_path_temp return model, transformed From 6c64d35ea95285802c1fb53906b5154be3ffe50e Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Tue, 1 Jul 2025 12:24:45 +0530 Subject: [PATCH 06/22] [QEff. Finetune]: Enabled FT CI tests. (#420) - Enabled CI tests for Finetuning. - Updated Jenkins file to install torch_qaic as it is required during FT tests. - Added finetune as a new pytest flag and updated other existing tests not to trigger for this flag. --------- Signed-off-by: meetkuma Co-authored-by: Meet Patel --- QEfficient/finetune/dataset/samsum_dataset.py | 2 +- scripts/Jenkinsfile | 32 +++-- tests/finetune/test_finetune.py | 109 +++++++++++++++--- 3 files changed, 119 insertions(+), 24 deletions(-) diff --git a/QEfficient/finetune/dataset/samsum_dataset.py b/QEfficient/finetune/dataset/samsum_dataset.py index 67726d731..f3f68140b 100644 --- a/QEfficient/finetune/dataset/samsum_dataset.py +++ b/QEfficient/finetune/dataset/samsum_dataset.py @@ -9,7 +9,7 @@ def get_preprocessed_samsum(dataset_config, tokenizer, split, context_length=None): - dataset = datasets.load_dataset("Samsung/samsum", split=split, trust_remote_code=True) + dataset = datasets.load_dataset("knkarthick/samsum", split=split, trust_remote_code=True) prompt = "Summarize this dialog:\n{dialog}\n---\nSummary:\n" diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index e6a69d5fb..103c04b73 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -25,6 +25,7 @@ pipeline { pip install junitparser pytest-xdist && pip install librosa==0.10.2 soundfile==0.13.1 && #packages needed to load example for whisper testing pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.19.1+cpu einops==0.8.1 && #packages to load VLMs + pip install /opt/qti-aic/integrations/torch_qaic/py310/torch_qaic-0.1.0-cp310-cp310-linux_x86_64.whl && # For finetuning tests rm -rf QEfficient" ''' } @@ -41,7 +42,7 @@ pipeline { mkdir -p $PWD/Non_cli_qaic && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_cli_qaic && - pytest tests -m '(not cli) and (not on_qaic)' --ignore tests/vllm -n auto --junitxml=tests/tests_log1.xml && + pytest tests -m '(not cli) and (not on_qaic) and (not finetune)' --ignore tests/vllm -n auto --junitxml=tests/tests_log1.xml && junitparser merge tests/tests_log1.xml tests/tests_log.xml && deactivate" ''' @@ -58,7 +59,7 @@ pipeline { mkdir -p $PWD/Non_qaic && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_qaic && - pytest tests -m '(not cli) and (on_qaic) and (not multimodal) and (not qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml && + pytest tests -m '(not cli) and (on_qaic) and (not multimodal) and (not qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml && junitparser merge tests/tests_log2.xml tests/tests_log.xml && deactivate" ''' @@ -77,14 +78,14 @@ pipeline { mkdir -p $PWD/Non_cli_qaic_multimodal && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_cli_qaic_multimodal && - pytest tests -m '(not cli) and (on_qaic) and (multimodal) and (not qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log6.xml && + pytest tests -m '(not cli) and (on_qaic) and (multimodal) and (not qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log6.xml && junitparser merge tests/tests_log6.xml tests/tests_log.xml && deactivate" ''' } } } - stage('CLI Tests') { + stage('Inference Tests') { steps { timeout(time: 60, unit: 'MINUTES') { sh ''' @@ -96,7 +97,7 @@ pipeline { mkdir -p $PWD/cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/cli && - pytest tests -m '(cli and not qnn)' --ignore tests/vllm --junitxml=tests/tests_log3.xml && + pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log3.xml && junitparser merge tests/tests_log3.xml tests/tests_log.xml && deactivate" ''' @@ -125,7 +126,7 @@ pipeline { mkdir -p $PWD/Qnn_cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Qnn_cli && - pytest tests -m '(cli and qnn)' --ignore tests/vllm --junitxml=tests/tests_log4.xml && + pytest tests -m '(cli and qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log4.xml && junitparser merge tests/tests_log4.xml tests/tests_log.xml && deactivate" ''' @@ -144,7 +145,7 @@ pipeline { mkdir -p $PWD/Qnn_non_cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Qnn_non_cli && - pytest tests -m '(not cli) and (qnn) and (on_qaic) and (not multimodal)' --ignore tests/vllm --junitxml=tests/tests_log5.xml && + pytest tests -m '(not cli) and (qnn) and (on_qaic) and (not multimodal) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log5.xml && junitparser merge tests/tests_log5.xml tests/tests_log.xml && deactivate" ''' @@ -170,6 +171,23 @@ pipeline { } } } + stage('Finetune CLI Tests') { + steps { + timeout(time: 5, unit: 'MINUTES') { + sh ''' + sudo docker exec ${BUILD_TAG} bash -c " + cd /efficient-transformers && + . preflight_qeff/bin/activate && + mkdir -p $PWD/cli_qaic_finetuning && + export TOKENIZERS_PARALLELISM=false && + export QEFF_HOME=$PWD/cli_qaic_finetuning && + pytest tests -m '(cli) and (on_qaic) and (not qnn) and (not multimodal) and (finetune)' --ignore tests/vllm --junitxml=tests/tests_log_finetune.xml && + junitparser merge tests/tests_log_finetune.xml tests/tests_log.xml && + deactivate" + ''' + } + } + } } post { diff --git a/tests/finetune/test_finetune.py b/tests/finetune/test_finetune.py index dbff66fd4..89a4d2498 100644 --- a/tests/finetune/test_finetune.py +++ b/tests/finetune/test_finetune.py @@ -7,9 +7,11 @@ import os import shutil +from pathlib import Path import numpy as np import pytest +import requests import torch.optim as optim from torch.utils.data import DataLoader @@ -17,61 +19,125 @@ import QEfficient.cloud.finetune from QEfficient.cloud.finetune import main as finetune +alpaca_json_path = Path.cwd() / "alpaca_data.json" + def clean_up(path): - if os.path.exists(path): + if os.path.isdir(path) and os.path.exists(path): shutil.rmtree(path) + if os.path.isfile(path): + os.remove(path) + + +def download_alpaca(): + alpaca_url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json" + response = requests.get(alpaca_url) + + with open(alpaca_json_path, "wb") as f: + f.write(response.content) configs = [ pytest.param( "meta-llama/Llama-3.2-1B", # model_name + "generation", # task_type 10, # max_eval_step 20, # max_train_step + "gsm8k_dataset", # dataset_name + None, # data_path 1, # intermediate_step_save None, # context_length True, # run_validation True, # use_peft "qaic", # device - id="llama_config", # config name - ) + 0.0043353, # expected_train_loss + 1.0043447, # expected_train_metric + 0.0117334, # expected_eval_loss + 1.0118025, # expected_eval_metric + id="llama_config_gsm8k", # config name + ), + pytest.param( + "meta-llama/Llama-3.2-1B", # model_name + "generation", # task_type + 10, # max_eval_step + 20, # max_train_step + "alpaca_dataset", # dataset_name + alpaca_json_path, # data_path + 1, # intermediate_step_save + None, # context_length + True, # run_validation + True, # use_peft + "qaic", # device + 0.0006099, # expected_train_loss + 1.0006101, # expected_train_metric + 0.0065296, # expected_eval_loss + 1.0065510, # expected_eval_metric + id="llama_config_alpaca", # config name + ), + pytest.param( + "google-bert/bert-base-uncased", # model_name + "seq_classification", # task_type + 10, # max_eval_step + 20, # max_train_step + "imdb_dataset", # dataset_name + None, # data_path + 1, # intermediate_step_save + None, # context_length + True, # run_validation + False, # use_peft + "qaic", # device + 0.00052981, # expected_train_loss + 0.55554199, # expected_train_metric + 0.00738618, # expected_eval_loss + 0.70825195, # expected_eval_metric + id="bert_config_imdb", # config name + ), ] -@pytest.mark.skip(reason="Currently CI is broken. Once it is fixed we will enable this test.") @pytest.mark.cli @pytest.mark.on_qaic @pytest.mark.finetune @pytest.mark.parametrize( - "model_name,max_eval_step,max_train_step,intermediate_step_save,context_length,run_validation,use_peft,device", + "model_name,task_type,max_eval_step,max_train_step,dataset_name,data_path,intermediate_step_save,context_length,run_validation,use_peft,device,expected_train_loss,expected_train_metric,expected_eval_loss,expected_eval_metric", configs, ) -def test_finetune( +def test_finetune_llama( model_name, + task_type, max_eval_step, max_train_step, + dataset_name, + data_path, intermediate_step_save, context_length, run_validation, use_peft, device, + expected_train_loss, + expected_train_metric, + expected_eval_loss, + expected_eval_metric, mocker, ): train_config_spy = mocker.spy(QEfficient.cloud.finetune, "TrainConfig") generate_dataset_config_spy = mocker.spy(QEfficient.cloud.finetune, "generate_dataset_config") generate_peft_config_spy = mocker.spy(QEfficient.cloud.finetune, "generate_peft_config") - get_dataloader_kwargs_spy = mocker.spy(QEfficient.cloud.finetune, "get_dataloader_kwargs") + get_dataloader_kwargs_spy = mocker.spy(QEfficient.finetune.utils.dataset_utils, "get_dataloader_kwargs") update_config_spy = mocker.spy(QEfficient.cloud.finetune, "update_config") - get_custom_data_collator_spy = mocker.spy(QEfficient.cloud.finetune, "get_custom_data_collator") - get_preprocessed_dataset_spy = mocker.spy(QEfficient.cloud.finetune, "get_preprocessed_dataset") + get_custom_data_collator_spy = mocker.spy(QEfficient.finetune.utils.dataset_utils, "get_custom_data_collator") + get_preprocessed_dataset_spy = mocker.spy(QEfficient.finetune.utils.dataset_utils, "get_preprocessed_dataset") get_longest_seq_length_spy = mocker.spy(QEfficient.cloud.finetune, "get_longest_seq_length") print_model_size_spy = mocker.spy(QEfficient.cloud.finetune, "print_model_size") train_spy = mocker.spy(QEfficient.cloud.finetune, "train") kwargs = { "model_name": model_name, + "task_type": task_type, "max_eval_step": max_eval_step, "max_train_step": max_train_step, + "dataset": dataset_name, + "data_path": data_path, "intermediate_step_save": intermediate_step_save, "context_length": context_length, "run_validation": run_validation, @@ -79,22 +145,26 @@ def test_finetune( "device": device, } + if dataset_name == "alpaca_dataset": + download_alpaca() + results = finetune(**kwargs) - assert np.allclose(results["avg_train_loss"], 0.00232327, atol=1e-5), "Train loss is not matching." - assert np.allclose(results["avg_train_metric"], 1.002326, atol=1e-5), "Train metric is not matching." - assert np.allclose(results["avg_eval_loss"], 0.0206124, atol=1e-5), "Eval loss is not matching." - assert np.allclose(results["avg_eval_metric"], 1.020826, atol=1e-5), "Eval metric is not matching." + assert np.allclose(results["avg_train_loss"], expected_train_loss, atol=1e-3), "Train loss is not matching." + assert np.allclose(results["avg_train_metric"], expected_train_metric, atol=1e-3), "Train metric is not matching." + assert np.allclose(results["avg_eval_loss"], expected_eval_loss, atol=1e-3), "Eval loss is not matching." + assert np.allclose(results["avg_eval_metric"], expected_eval_metric, atol=1e-3), "Eval metric is not matching." assert results["avg_epoch_time"] < 60, "Training should complete within 60 seconds." train_config_spy.assert_called_once() generate_dataset_config_spy.assert_called_once() - generate_peft_config_spy.assert_called_once() - get_custom_data_collator_spy.assert_called_once() + if task_type == "generation": + generate_peft_config_spy.assert_called_once() get_longest_seq_length_spy.assert_called_once() print_model_size_spy.assert_called_once() train_spy.assert_called_once() assert update_config_spy.call_count == 2 + assert get_custom_data_collator_spy.call_count == 2 assert get_dataloader_kwargs_spy.call_count == 2 assert get_preprocessed_dataset_spy.call_count == 2 @@ -123,12 +193,19 @@ def test_finetune( f"{train_config.gradient_accumulation_steps} which is gradient accumulation steps." ) - saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/adapter_model.safetensors") + if use_peft: + saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/adapter_model.safetensors") + else: + saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/model.safetensors") assert os.path.isfile(saved_file) clean_up(train_config.output_dir) clean_up("runs") + clean_up("qaic-dumps") clean_up(train_config.dump_root_dir) + if dataset_name == "alpaca_dataset": + clean_up(alpaca_json_path) + # TODO (Meet): Add seperate tests for BERT FT and LLama FT From 10fb2ac945a935262a0c9ee39e1f49eaa8aa7b07 Mon Sep 17 00:00:00 2001 From: quic-akuruvil Date: Tue, 1 Jul 2025 13:43:25 +0530 Subject: [PATCH 07/22] Gemma 3 minor fixes (#476) - CPR (#484) CI enablement and other minor fixes for Gemma3 --------- --------- Signed-off-by: Ann Kuruvilla Signed-off-by: Dipankar Sarkar Co-authored-by: Dipankar Sarkar From 71e554f59622b48a2c611ca4dd7038d1ebfc0784 Mon Sep 17 00:00:00 2001 From: Hem Agnihotri Date: Tue, 1 Jul 2025 13:48:40 +0530 Subject: [PATCH 08/22] Revert "Gemma 3 minor fixes (#476) - CPR" (#485) Reverts quic/efficient-transformers#484 From d823503bcaa8f0f2032d8985b163f237fd303795 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Wed, 2 Jul 2025 16:21:18 +0530 Subject: [PATCH 09/22] [Docs/Readme]: Main Readme updating for latest news and adding the onboarded features in docs (#423) This PR is created for updating the readme and docs for adding the latest features added in this release. --------- Signed-off-by: Abukhoyer Shaik --- README.md | 19 +++++++++++++++---- docs/source/introduction.md | 30 +++++++++++++++++++++++------- docs/source/quick_start.md | 9 ++++++++- docs/source/validate.md | 18 +++++++++++------- 4 files changed, 57 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 2edb65797..85d0a18d1 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,24 @@ --- *Latest news* :fire:
+ - [06/2025] Added support for Llama4 Multi-Model [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) - [06/2025] Added support for Gemma3 Multi-Modal-Model [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it) - [06/2025] Added support of model `hpcai-tech/grok-1` [hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1) +- [06/2025] Added support for sentence embedding which improves efficiency, Flexible/Custom Pooling configuration and compilation with multiple sequence lengths, [Embedding model](https://github.com/quic/efficient-transformers/pull/424). + +
+More + +- [04/2025] Added support for [Granite Vision models](https://huggingface.co/collections/ibm-granite/granite-vision-models-67b3bd4ff90c915ba4cd2800) +- [04/2025] Added support for [Granite MOE models](https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-base) +- [04/2025] Support for [SpD, multiprojection heads](https://quic.github.io/efficient-transformers/source/quick_start.html#draft-based-speculative-decoding). Implemented post-attention hidden size projections to speculate tokens ahead of the base model +- [04/2025] [QNN Compilation support](https://github.com/quic/efficient-transformers/pull/374) for AutoModel classes. QNN compilation capabilities for multi-models, embedding models and causal models. +- [04/2025] Added support for separate prefill and decode compilation for encoder (vision) and language models. This feature will be utilized for [disaggregated serving](https://github.com/quic/efficient-transformers/pull/365). +- [04/2025] SwiftKV Support for both [continuous and non-continuous batching execution](https://github.com/quic/efficient-transformers/pull/367) in SwiftKV. +- [04/2025] Support for [GGUF model execution](https://github.com/quic/efficient-transformers/pull/368) (without quantized weights) +- [04/2025] Enabled FP8 model support on [replicate_kv_heads script](https://github.com/quic/efficient-transformers/tree/main/scripts/replicate_kv_head) +- [04/2025] Added support for [gradient checkpointing](https://github.com/quic/efficient-transformers/pull/338) in the finetuning script - [04/2025] Added support of model `ibm-granite/granite-vision-3.2-2b`[ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b) - [03/2025] Added support for swiftkv model [Snowflake/Llama-3.1-SwiftKV-8B-Instruct](https://huggingface.co/Snowflake/Llama-3.1-SwiftKV-8B-Instruct) - [02/2025] [VLMs support](https://github.com/quic/efficient-transformers/pull/267) added for the models [InternVL-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B), [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [Mllama](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) @@ -18,10 +33,6 @@ - [11/2024] [finite adapters support](https://github.com/quic/efficient-transformers/pull/153) allows mixed adapter usage for peft models. - [11/2024] [Speculative decoding TLM](https://github.com/quic/efficient-transformers/pull/119) QEFFAutoModelForCausalLM model can be compiled for returning more than 1 logits during decode for TLM. - [11/2024] Added support for [Meta-Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct), [Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and [Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B) - -
-More -- [04/2025] [Granite 3.0 and 3.1 Language MOE Models] (https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-base) - [09/2024] [AWQ](https://arxiv.org/abs/2306.00978)/[GPTQ](https://arxiv.org/abs/2210.17323) 4-bit quantized models are supported
- [09/2024] Now we support [PEFT](https://huggingface.co/docs/peft/index) models - [01/2025] Added support for [Ibm-Granite] (https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) diff --git a/docs/source/introduction.md b/docs/source/introduction.md index d842b40c4..7a2e3fd02 100644 --- a/docs/source/introduction.md +++ b/docs/source/introduction.md @@ -23,19 +23,35 @@ For other models, there is comprehensive documentation to inspire upon the chang ***Latest news*** :
- [coming soon] Support for more popular [models](models_coming_soon)
+- [06/2025] Added support for Llama4 Multi-Model [meta-llama/Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) +- [06/2025] Added support for Gemma3 Multi-Modal-Model [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it) +- [06/2025] Added support of model `hpcai-tech/grok-1` [hpcai-tech/grok-1](https://huggingface.co/hpcai-tech/grok-1) +- [06/2025] Added support for sentence embedding which improves efficiency, Flexible/Custom Pooling configuration and compilation with multiple sequence lengths, [Embedding model](https://github.com/quic/efficient-transformers/pull/424). + +
+More + +- [04/2025] Added support for [Granite Vision models](https://huggingface.co/collections/ibm-granite/granite-vision-models-67b3bd4ff90c915ba4cd2800) +- [04/2025] Added support for [Granite MOE models](https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-base) +- [04/2025] Support for [SpD, multiprojection heads](https://quic.github.io/efficient-transformers/source/quick_start.html#draft-based-speculative-decoding). Implemented post-attention hidden size projections to speculate tokens ahead of the base model +- [04/2025] [QNN Compilation support](https://github.com/quic/efficient-transformers/pull/374) for AutoModel classes. QNN compilation capabilities for multi-models, embedding models and causal models. +- [04/2025] Added support for separate prefill and decode compilation for encoder (vision) and language models. This feature will be utilized for [disaggregated serving](https://github.com/quic/efficient-transformers/pull/365). +- [04/2025] SwiftKV Support for both [continuous and non-continuous batching execution](https://github.com/quic/efficient-transformers/pull/367) in SwiftKV. +- [04/2025] Support for [GGUF model execution](https://github.com/quic/efficient-transformers/pull/368) (without quantized weights) +- [04/2025] Enabled FP8 model support on [replicate_kv_heads script](https://github.com/quic/efficient-transformers/tree/main/scripts/replicate_kv_head) +- [04/2025] Added support for [gradient checkpointing](https://github.com/quic/efficient-transformers/pull/338) in the finetuning script +- [03/2025] Added support for swiftkv model [Snowflake/Llama-3.1-SwiftKV-8B-Instruct](https://huggingface.co/Snowflake/Llama-3.1-SwiftKV-8B-Instruct) +- [02/2025] [VLMs support](https://github.com/quic/efficient-transformers/pull/267) added for the models [InternVL-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B), [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [Mllama](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) - [01/2025] [FP8 models support](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127) Added support for inference of FP8 models. -- [01/2025] Added support for [Ibm-Granite](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) +- [01/2025] Added support for [Ibm-Granite] (https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) - [11/2024] [finite adapters support](https://github.com/quic/efficient-transformers/pull/153) allows mixed adapter usage for peft models. - [11/2024] [Speculative decoding TLM](https://github.com/quic/efficient-transformers/pull/119) QEFFAutoModelForCausalLM model can be compiled for returning more than 1 logits during decode for TLM. - [11/2024] Added support for [Meta-Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct), [Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and [Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B) -- [09/2024] [AWQ](https://arxiv.org/abs/2306.00978)/[GPTQ](https://arxiv.org/abs/2210.17323) 4-bit quantized models are supported +- [09/2024] [AWQ](https://arxiv.org/abs/2306.00978)/[GPTQ](https://arxiv.org/abs/2210.17323) 4-bit quantized models are supported
- [09/2024] Now we support [PEFT](https://huggingface.co/docs/peft/index) models -
-More - -- [01/2025] Added support for [Ibm-Granite](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) -- [01/2025] Added support for [Ibm-Granite-Guardian](https://huggingface.co/ibm-granite/granite-guardian-3.1-8b) +- [01/2025] Added support for [Ibm-Granite] (https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) +- [01/2025] Added support for [Ibm-Granite-Guardian] (https://huggingface.co/ibm-granite/granite-guardian-3.1-8b) - [09/2024] Added support for [Gemma-2-Family](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
- [09/2024] Added support for [CodeGemma-Family](https://huggingface.co/collections/google/codegemma-release-66152ac7b683e2667abdee11) - [09/2024] Added support for [Gemma-Family](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index abab4cfc3..3896a616d 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -14,8 +14,15 @@ To achieve this, we have 2 levels of APIs, with different levels of abstraction. | Feature | Impact | | --- | --- | | Context Length Specializations (upcoming) | Increases the maximum context length that models can handle, allowing for better performance on tasks requiring long sequences of text. | -| Swift KV [Snowflake/Llama-3.1-SwiftKV-8B-Instruct] | Reduces computational overhead during inference by optimizing key-value pair processing, leading to improved throughput. | | Block Attention (in progress) | Reduces inference latency and computational cost by dividing context into blocks and reusing key-value states, particularly useful in RAG. | +| Sentence embedding, Flexible Pooling configuration and compilation with multiple sequence lengths| Supports standard/custom pooling with AI 100 acceleration and sentence embedding. Enables efficient sentence embeddings via Efficient-Transformers. Compile with one or multiple seq_len; optimal graph auto-selected at runtime. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/embedding_model.py) for more **details**.| +| [SpD, multiprojection heads](https://quic.github.io/efficient-transformers/source/quick_start.html#draft-based-speculative-decoding) | Implemented post-attention hidden size projections to speculate tokens ahead of the base model. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/multiprojs_spd_inference.py) for more **details**.| +| [QNN Compilation support](https://github.com/quic/efficient-transformers/pull/374) | Enabled for AutoModel classes QNN compilation capabilities for multi-models, embedding models and causal models.| +| [Disaggregated serving](https://github.com/quic/efficient-transformers/pull/365) | It support for separate prefill and decode compilation for encoder (vision) and language models.| +| [GGUF model execution](https://github.com/quic/efficient-transformers/pull/368) | Supported GGUF model execution (without quantized weights). Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/basic_gguf_models.py) for more **details**. | +| Replication of KV | Enabled FP8 model support on [replicate_kv_heads script](https://github.com/quic/efficient-transformers/tree/main/scripts/replicate_kv_head).| +| [gradient checkpointing](https://github.com/quic/efficient-transformers/pull/338) | Supports gradient checkpointing in the finetuning script| +| Swift KV [Snowflake/Llama-3.1-SwiftKV-8B-Instruct](https://huggingface.co/Snowflake/Llama-3.1-SwiftKV-8B-Instruct) | Reduces computational overhead during inference by optimizing key-value pair processing, leading to improved throughput. Support for both [continuous and non-continuous batching execution](https://github.com/quic/efficient-transformers/pull/367) in SwiftKV | | [Vision Language Model](QEFFAutoModelForImageTextToText) | Provides support for the AutoModelForImageTextToText class from the transformers library, enabling advanced vision-language tasks. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/image_text_to_text_inference.py) for more **details**. | | [Speech Sequence to Sequence Model](QEFFAutoModelForSpeechSeq2Seq) | Provides support for the QEFFAutoModelForSpeechSeq2Seq Facilitates speech-to-text sequence models. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/speech_to_text/run_whisper_speech_to_text.py) for more **details**. | | Support for FP8 Execution | Enables execution with FP8 precision, significantly improving performance and reducing memory usage for computational tasks. | diff --git a/docs/source/validate.md b/docs/source/validate.md index c10d68daf..5c3ce2b24 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -17,6 +17,8 @@ | **GPT2LMHeadModel** | GPT-2 | [openai-community/gpt2](https://huggingface.co/openai-community/gpt2) | ✔️ | | **GraniteForCausalLM** | Granite 3.1 | [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
[ibm-granite/granite-guardian-3.1-8b](https://huggingface.co/ibm-granite/granite-guardian-3.1-8b) | ✔️ | | | Granite 20B | [ibm-granite/granite-20b-code-base-8k](https://huggingface.co/ibm-granite/granite-20b-code-base-8k)
[ibm-granite/granite-20b-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k) | ✔️ | +| **GraniteMoeForCausalLM** | Granite 3.0 | [ibm-granite/granite-3.0-1b-a400m-base](https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-base) | ✔️ | +| | Granite 3.1 | [ibm-granite/granite-3.1-1b-a400m-base](https://huggingface.co/ibm-granite/granite-3.0-1b-a400m-base) | ✔️ | | **InternVLChatModel** | Intern-VL | [OpenGVLab/InternVL2_5-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B) | | | **LlamaForCausalLM** | CodeLlama | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf)
[codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf)
[codellama/CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) | ✔️ | | | DeepSeek-R1-Distill-Llama | [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | ✔️ | @@ -57,13 +59,13 @@ ### Vision-Language Models (Text + Image Generation) **QEff Auto Class:** `QEFFAutoModelForImageTextToText` -| Architecture | Model Family | Representative Models | -|-----------------------------|--------------|----------------------------------------| -| **LlavaForConditionalGeneration** | LLaVA-1.5 | [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | -| **MllamaForConditionalGeneration** | Llama 3.2 | [meta-llama/Llama-3.2-11B-Vision Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)
[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | -|**LlavaNextForConditionalGeneration** | Granite Vision | [ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b) -|**Llama4ForConditionalGeneration** | Llama-4-Scout | [Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) -|**Gemma3ForConditionalGeneration** | Gemma3 | [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it) +| Architecture | Model Family | Representative Models | CB Support | Single Qpc Support | Dual Qpc Support | +|-----------------------------|--------------|----------------------------------------------------------------------------------------|------------|--------------------|------------------| +| **LlavaForConditionalGeneration** | LLaVA-1.5 | [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | ✕ | ✔️ | ✔️ | +| **MllamaForConditionalGeneration** | Llama 3.2 | [meta-llama/Llama-3.2-11B-Vision Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)
[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | ✕ | ✔️ | ✔️ | +|**LlavaNextForConditionalGeneration** | Granite Vision | [ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b) | ✕ | ✕ | ✔️ | +|**Llama4ForConditionalGeneration** | Llama-4-Scout | [Llama-4-Scout-17B-16E-Instruct](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct) | ✕ | ✔️ | ✔️ | +|**Gemma3ForConditionalGeneration** | Gemma3 | [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it)| ✕ | ✔️ | ✔️ | ### Audio Models (Automatic Speech Recognition) - Transcription Task @@ -78,6 +80,8 @@ | Architecture | Model Family | Representative Models | |-------------------------|--------------|--------------------------------------------| +| **Qwen3MoeForCausalLM** |Qwen3| [Qwen/Qwen3-MoE-15B-A2B]() | +| **Mistral3ForConditionalGeneration**|Mistral 3.1| [mistralai/Mistral-Small-3.1-24B-Base-2503](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503) | | **BaichuanForCausalLM** | Baichuan2 | [baichuan-inc/Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base) | | **CohereForCausalLM** | Command-R | [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) | | **DbrxForCausalLM** | DBRX | [databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base) | \ No newline at end of file From c5a5c178e3a445f9b8ced2ad2a6ed56ca5d83e2a Mon Sep 17 00:00:00 2001 From: Dhiraj Kumar Sah Date: Thu, 3 Jul 2025 11:27:30 +0530 Subject: [PATCH 10/22] QUICKFIX: Removed the redundant breakpoint comment in modeling_llava_next file. (#475) Signed-off-by: Dhiraj Kumar Sah --- QEfficient/exporter/export_hf_to_cloud_ai_100.py | 1 - QEfficient/exporter/export_utils.py | 4 ---- .../models/codegen/modeling_codegen.py | 1 - .../models/gpt_bigcode/modeling_gpt_bigcode.py | 2 -- .../models/internvl/modeling_internvl.py | 1 - .../transformers/models/llama4/modeling_llama4.py | 14 +++++++++++--- .../models/llava_next/modeling_llava_next.py | 1 - 7 files changed, 11 insertions(+), 13 deletions(-) diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 1a0a04fc3..b769680ef 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -129,7 +129,6 @@ def export_bertstyle_model_to_onnx(model_name, model, tokenizer, onnx_dir_path, ) # Generate inputFiles - # todo(ochougul):rename to bert_style_input_list.txt input_list_file = os.path.join(onnx_dir_path, "input_list.txt") generate_input_files( input_files_path=os.path.join(onnx_dir_path, "inputFiles"), diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py index 11bb1e7bb..f86a0f254 100644 --- a/QEfficient/exporter/export_utils.py +++ b/QEfficient/exporter/export_utils.py @@ -218,8 +218,6 @@ def fix_onnx_fp16( :str: Updated base name of exported ONNX model. """ model = onnx.load(os.path.join(gen_models_path, f"{model_base_name}.onnx")) - # TODO: Remove this `fix_onnx_fp16` function and replace with this transform - # as we're not utilizing the validations done in this function model, fp16_fix = FP16ClipTransform.apply(model, onnx_base_dir=gen_models_path) if fp16_fix: @@ -256,8 +254,6 @@ def fix_onnx_fp16( if ort_outputs is not None: for oname, orto, ortof in zip(output_names, ort_outputs, ort_outputs_fixed): fix_diff = np.abs(orto.astype(np.float32) - ortof.astype(np.float32)).max() - # TODO: need to the debug this - # info(oname, fix_diff) close_outputs.append(fix_diff < 1e-5) else: info("No constants out of FP16 range") diff --git a/QEfficient/transformers/models/codegen/modeling_codegen.py b/QEfficient/transformers/models/codegen/modeling_codegen.py index 09400c51e..e0f6b5196 100644 --- a/QEfficient/transformers/models/codegen/modeling_codegen.py +++ b/QEfficient/transformers/models/codegen/modeling_codegen.py @@ -85,7 +85,6 @@ def forward( Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], ]: qkv = self.qkv_proj(hidden_states) - # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic mp_num = 4 qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1)) diff --git a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index d4a322a56..5dd9362ee 100644 --- a/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -29,8 +29,6 @@ # Fused kernels # Use separate functions for each case because conditionals prevent kernel fusion. -# TODO: Could have better fused kernels depending on scaling, dropout and head mask. -# Is it doable without writing 32 functions? @torch.jit.script def upcast_masked_softmax( x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float, softmax_dtype: torch.dtype diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 13f0eae7c..b6fb9fd38 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -66,7 +66,6 @@ def get_specializations( kv_offload: bool = False, **compiler_options, ): - # TODO: check if this should be named num_patches or something else num_patches = compiler_options.pop("num_patches", None) if num_patches is None: logger.warning( diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index 6b30c7804..ffcec4451 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -312,8 +312,10 @@ def __init__(self, config: Llama4TextConfig, device=None): self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] # self.max_seq_len_cached = config.max_position_embeddings - # TODO: vbaddi Shouldn't for rope, the max posision_embeddings be original embeddings for rope, - # chunk size 8192 always? and Revisit when >8K Chunked attention is enabled. + # TODO: max sequence length cached should be taken before export and model should be exported with that paramter. + logger.warning( + f"max_seq_len_cached is set to {constants.LLAMA4_MAX_POSITION_EMBEDDINGS}, this is the maximum sequence length supported for the model" + ) self.max_seq_len_cached = constants.LLAMA4_MAX_POSITION_EMBEDDINGS # Get inverse frequency and scaling function (handles yarn/etc) @@ -883,7 +885,6 @@ def get_specializations( kv_offload: bool = False, **compiler_options, ): - # TODO: check if this should be named num_patches or something else max_num_tiles = compiler_options.pop("max_num_tiles", None) if max_num_tiles is None: logger.warning( @@ -901,6 +902,13 @@ def get_specializations( else constants.LLAMA4_ATTENTION_CHUNK_SIZE ), ) + if ( + prefill_seq_len > constants.LLAMA4_MAX_POSITION_EMBEDDINGS + or ctx_len > constants.LLAMA4_MAX_POSITION_EMBEDDINGS + ): + raise ValueError( + f"max_seq_len_cached is set to {constants.LLAMA4_MAX_POSITION_EMBEDDINGS}, Your prefill_seq_len is {prefill_seq_len} and ctx_len is {ctx_len}." + ) if img_size is None and hasattr(self.config.vision_config, "image_size"): img_size = getattr(self.config.vision_config, "image_size") diff --git a/QEfficient/transformers/models/llava_next/modeling_llava_next.py b/QEfficient/transformers/models/llava_next/modeling_llava_next.py index 338d141f8..23434fc18 100755 --- a/QEfficient/transformers/models/llava_next/modeling_llava_next.py +++ b/QEfficient/transformers/models/llava_next/modeling_llava_next.py @@ -123,7 +123,6 @@ def __init__(self, model): def forward(self, input_ids, vision_embeds, position_ids, image_idx, past_key_values): inputs_embeds = self.model.get_input_embeddings()(input_ids) image_features = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) - # breakpoint() mask = input_ids == self.config.image_token_index indices1 = mask.to(torch.int64).cumsum(1) - 1 indices1 = torch.where(indices1 != -1, indices1 + image_idx, indices1) From b90c1acad1ef3bf197c69665d623775e3637bcb7 Mon Sep 17 00:00:00 2001 From: Rishin Raj Date: Thu, 3 Jul 2025 13:50:50 +0530 Subject: [PATCH 11/22] MDP hash support (#479) Signed-off-by: Rishin Raj --- QEfficient/base/modeling_qeff.py | 65 +++++++++++++++----------------- QEfficient/utils/__init__.py | 3 ++ QEfficient/utils/_utils.py | 26 +++++++++++++ QEfficient/utils/constants.py | 4 ++ 4 files changed, 64 insertions(+), 34 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 1aafb1ba2..d9d6823ae 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -7,7 +7,6 @@ import hashlib import inspect -import json import logging import shutil import subprocess @@ -23,7 +22,7 @@ from QEfficient.base.pytorch_transforms import PytorchTransform from QEfficient.compile.qnn_compiler import compile as qnn_compile from QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.utils import constants, dump_qconfig +from QEfficient.utils import constants, create_json, dump_qconfig, generate_mdp_partition_config, load_json from QEfficient.utils.cache import QEFF_HOME, to_hashable logger = logging.getLogger(__name__) @@ -269,8 +268,8 @@ def _compile( specializations=specializations, custom_io=custom_io, device_group=list(range(mdp_ts_num_devices)), - num_cores=compiler_options.get("aic_num_cores", 16), - mxfp6=compiler_options.get("mxfp6_matmul", False), + num_cores=compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES), + mxfp6=compiler_options.get("mxfp6_matmul", constants.DEFAULT_AIC_MXPF6_MATMUL), mxint8=mxint8_kv_cache, qnn_config=qnn_config, ) @@ -278,8 +277,8 @@ def _compile( return self.qpc_path command = constants.COMPILER + [f"-m={onnx_path}"] - if mdp_ts_json_path := compiler_options.pop("mdp_ts_json_path", None): - mdp_ts_num_devices = None + + if mdp_ts_json_path := compiler_options.pop("mdp_load_partition_config", None): command.append(f"-mdp-load-partition-config={mdp_ts_json_path}") for key, value in compiler_options.items(): @@ -289,6 +288,17 @@ def _compile( command.append(option) continue command.append(f"{option}={value}") + + # Create a dummy mdp_ts_json if mdp-load-partition-config not provided and num_devices > 1 + if mdp_ts_json_path is not None: + mdp_ts_json = load_json(str(mdp_ts_json_path)) + elif mdp_ts_num_devices > 1: + mdp_ts_json = generate_mdp_partition_config( + mdp_ts_num_devices, compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES) + ) + else: + mdp_ts_json = None + compile_hash = hashlib.sha256(to_hashable(command)) if specializations is not None: @@ -299,14 +309,17 @@ def _compile( if num_speculative_tokens: compile_hash.update(to_hashable({"num_speculative_tokens": num_speculative_tokens})) - # Hash num_devices too, since default value would always be 1. - compile_hash.update(to_hashable(mdp_ts_num_devices)) + + # Hash the MDP partition config and the number of devices. + compile_hash.update(to_hashable(mdp_ts_json)) + compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices})) # Check if already compiled compile_hash = compile_hash.hexdigest()[:16] compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash) qpc_path = compile_dir / "qpc" qpc_path.mkdir(parents=True, exist_ok=True) + if qpc_path.is_dir(): if (qpc_path / "programqpc.bin").is_file(): self.qpc_path = qpc_path @@ -314,15 +327,19 @@ def _compile( # Probably compilation failure last time, delete directory to start over shutil.rmtree(qpc_path) + # write the MDP partition config file if not provided + if mdp_ts_json is not None: + mdp_ts_json_path = compile_dir / f"mdp_ts_{mdp_ts_num_devices}.json" + create_json(str(mdp_ts_json_path), mdp_ts_json) + command.append(f"-mdp-load-partition-config={mdp_ts_json_path}") + # Write specializations.json file if specializations is not None: specializations_json = compile_dir / "specializations.json" - with open(specializations_json, "w") as fp: - json.dump( - {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]}, - fp, - indent=4, - ) + specializations_data = { + "specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations] + } + create_json(str(specializations_json), specializations_data) command.append(f"-network-specialization-config={specializations_json}") # Write custom_io.yaml file @@ -333,26 +350,6 @@ def _compile( fp.write(f" - IOName: {io_name}\n Precision: {dtype}\n\n") command.append(f"-custom-IO-list-file={custom_io_yaml}") - # Write mdp_config.json file - if not mdp_ts_json_path and mdp_ts_num_devices > 1: - num_cores = compiler_options.get("aic_num_cores", 16) - mdp_ts_json = compile_dir / f"mdp_ts_{mdp_ts_num_devices}.json" - with open(mdp_ts_json, "w") as fp: - json.dump( - { - "connections": [{"devices": list(range(mdp_ts_num_devices)), "type": "p2p"}], - "partitions": [ - { - "name": "Partition0", - "devices": [{"deviceId": d, "numCores": num_cores} for d in range(mdp_ts_num_devices)], - } - ], - }, - fp, - indent=4, - ) - command.append(f"-mdp-load-partition-config={mdp_ts_json}") - command.append(f"-aic-binary-dir={qpc_path}") logger.info(f"Running compiler: {' '.join(command)}") try: diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 7fc132b17..03fbff078 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -11,8 +11,10 @@ ) from QEfficient.utils._utils import ( # noqa: F401 check_and_assign_cache_dir, + create_json, custom_format_warning, dump_qconfig, + generate_mdp_partition_config, get_num_layers_from_config, get_num_layers_vlm, get_onnx_dir_name, @@ -24,6 +26,7 @@ hf_download, load_hf_processor, load_hf_tokenizer, + load_json, login_and_download_hf_lm, onnx_exists, padding_check_and_fix, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 106647bc0..b2a35c005 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -568,6 +568,32 @@ def create_json(file_path: str, json_data: object): print(f"Failed to create JSON File {file_path}: {e}") +def generate_mdp_partition_config(num_devices: int, num_cores: int) -> str: + """ + Generates an MDP partition configuration JSON file using the create_json utility. + + Args: + num_devices (int): Number of devices. + num_cores (int): Number of cores per device. + output_dir (str): Directory where the JSON file will be saved. + + Returns: + str: Path to the generated JSON file. + """ + + mdp_config = { + "connections": [{"devices": list(range(num_devices)), "type": "p2p"}], + "partitions": [ + { + "name": "Partition0", + "devices": [{"deviceId": d, "numCores": num_cores} for d in range(num_devices)], + } + ], + } + + return mdp_config + + def model_swap(func): def wrapper(*args, **kwargs): if "model" in kwargs and kwargs["model"] is not None: diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 526b01683..5e855094c 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -25,6 +25,10 @@ ONNX_EXPORT_IMAGE_DEPTH = 3 ONNX_EXPORT_CTX_LEN = 1024 +# Compiler defaults +DEFAULT_AIC_NUM_CORES = 16 +DEFAULT_AIC_MXPF6_MATMUL = False + # Store the qeff_models inside the ~/.cache directory or over-ride with an env variable. def get_models_dir(): From db38927062fbbbe0543e59016f358ee149466331 Mon Sep 17 00:00:00 2001 From: Swati Allabadi Date: Fri, 4 Jul 2025 23:29:28 +0530 Subject: [PATCH 12/22] [QEff Finetune] Adding dataset padding changes (#478) Padding the dataset with dummy samples (they won't contribute in total_loss) to make the #samples a multiple of degree of ddp*batch_size) in case of 1) Fine tuning through DDP 2) train_batch_size > 1 or val_batch_size > 0 --------- Signed-off-by: Swati Allabadi Co-authored-by: Swati Allabadi Co-authored-by: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> --- QEfficient/finetune/data/sampler.py | 16 +++-- QEfficient/finetune/utils/dataset_utils.py | 40 +++++++++-- QEfficient/finetune/utils/helper.py | 5 ++ QEfficient/finetune/utils/train_utils.py | 81 +++++++++++++++++++--- tests/finetune/test_finetune.py | 22 +++--- 5 files changed, 133 insertions(+), 31 deletions(-) diff --git a/QEfficient/finetune/data/sampler.py b/QEfficient/finetune/data/sampler.py index 1a4115419..60f789cbc 100644 --- a/QEfficient/finetune/data/sampler.py +++ b/QEfficient/finetune/data/sampler.py @@ -4,11 +4,9 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- - import random from itertools import islice -import numpy as np import torch @@ -22,14 +20,14 @@ def __init__(self, data_source, batch_size: int, drop_last: bool, shuffle: bool self.batch_size = batch_size self.drop_last = drop_last self.shuffle = shuffle + self.data_source = data_source def __iter__(self): - ids = np.argsort(self.lengths, kind="mergesort") + ids = list(range(len(self.data_source))) if self.drop_last: ids = ids[: len(ids) // self.batch_size * self.batch_size] batches = [ids[i : i + self.batch_size] for i in range(0, len(ids), self.batch_size)] - if self.shuffle: random.shuffle(batches) @@ -45,11 +43,17 @@ def __len__(self): class DistributedLengthBasedBatchSampler(torch.utils.data.BatchSampler): def __init__( - self, data_source, batch_size: int, num_replicas: int, rank: int, shuffle: bool = True, seed: int = 0 + self, + data_source, + batch_size: int, + num_replicas: int, + rank: int, + shuffle: bool = True, + seed: int = 0, ) -> None: random.seed(seed) self.batch_sampler = LengthBasedBatchSampler( - data_source, batch_size=batch_size, drop_last=True, shuffle=shuffle + data_source, batch_size=batch_size, drop_last=False, shuffle=shuffle ) self.num_replicas = num_replicas self.rank = rank diff --git a/QEfficient/finetune/utils/dataset_utils.py b/QEfficient/finetune/utils/dataset_utils.py index 42d0aae71..a0f7d19cd 100644 --- a/QEfficient/finetune/utils/dataset_utils.py +++ b/QEfficient/finetune/utils/dataset_utils.py @@ -4,13 +4,14 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- - +import datasets import torch import torch.distributed as dist from transformers.data import DataCollatorForSeq2Seq from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler from QEfficient.finetune.dataset.dataset_config import DATALOADER_COLLATE_FUNC, DATASET_PREPROC +from QEfficient.finetune.utils.helper import get_num_ddp_devices def get_preprocessed_dataset( @@ -54,27 +55,58 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split): dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=False ) kwargs["batch_size"] = batch_size - kwargs["drop_last"] = True + kwargs["drop_last"] = False else: kwargs["batch_size"] = batch_size - kwargs["drop_last"] = True + kwargs["drop_last"] = False kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer) return kwargs +def padding_dataset(train_config, dataset, batch_size): + if train_config.enable_ddp and train_config.enable_sorting_for_ddp: + if isinstance(dataset, datasets.Dataset): + # Hugging Face Dataset transformation + dataset = dataset.map(lambda x: {"input_length": len(x["input_ids"])}) + dataset = dataset.sort("input_length") + + else: + dataset = sorted(dataset, key=lambda x: len(x["input_ids"])) + + dummy_row = next(iter(dataset)) + dummy_row["labels"] = torch.tensor([-100] * len(dummy_row["labels"])) + padding_size = 0 + num_replicas = get_num_ddp_devices() + remainder = len(dataset) % (num_replicas * batch_size) + padding_size = (num_replicas * batch_size) - remainder + + dummy_data = [dummy_row.copy() for _ in range(padding_size)] + dummy_dataset = datasets.Dataset.from_list(dummy_data) + if isinstance(dataset, datasets.Dataset): + combined_dataset = datasets.concatenate_datasets([dataset, dummy_dataset]) + else: + combined_dataset = dataset + list(dummy_dataset) + return combined_dataset + + def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train"): dataset = get_preprocessed_dataset(tokenizer, dataset_config, split, context_length=train_config.context_length) + + batch_size = train_config.train_batch_size if split == "train" else train_config.val_batch_size + dataset = padding_dataset(train_config, dataset, batch_size) + dl_kwargs = get_dataloader_kwargs(train_config, dataset, tokenizer, split) # FIXME (Meet): Add custom data collator registration from the outside by the user. custom_data_collator = get_custom_data_collator(tokenizer, dataset_config) + if custom_data_collator: print("custom_data_collator is used") dl_kwargs["collate_fn"] = custom_data_collator print(f"length of dataset_{split}", len(dataset)) - # Create data loader + dataloader = torch.utils.data.DataLoader( dataset, num_workers=train_config.num_workers_dataloader, diff --git a/QEfficient/finetune/utils/helper.py b/QEfficient/finetune/utils/helper.py index fcc44fec8..8562b2aed 100644 --- a/QEfficient/finetune/utils/helper.py +++ b/QEfficient/finetune/utils/helper.py @@ -4,8 +4,13 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- +import os TASK_TYPE = ["generation", "seq_classification"] PEFT_METHOD = ["lora"] DEVICE = ["qaic", "cpu", "cuda"] BATCHING_STRATEGY = ["padding", "packing"] + + +def get_num_ddp_devices(): + return int(os.getenv("WORLD_SIZE", 1)) diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index 9f9f06917..f513ba5c4 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -151,7 +151,7 @@ def train( # enable profile for qaic qaic_profile.start_profiling(device, 1) if train_config.use_profiler else None - + num_dummy_samples = 0 for step, batch in enumerate(train_dataloader): # resume training from a particular checkpoint, assuming the dataset is not shuffled if train_config.use_peft and train_config.from_peft_checkpoint: @@ -192,6 +192,17 @@ def train( ) as verifier: model_outputs = model(**batch) loss = model_outputs.loss # Forward call + if (batch["labels"] != -100).sum() == 0: + loss = loss.nan_to_num(nan=0.0) + num_dummy_samples += train_config.train_batch_size + else: + num_dummy_samples_per_batch = ( + (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item() + ) + if num_dummy_samples_per_batch > 0: + num_dummy_samples += num_dummy_samples_per_batch + loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch + if train_config.task_type == "seq_classification": logits = model_outputs.logits labels = batch["labels"][:, 0] @@ -201,6 +212,17 @@ def train( else: model_outputs = model(**batch) loss = model_outputs.loss # Forward call + if (batch["labels"] != -100).sum() == 0: + loss = loss.nan_to_num(nan=0.0) + num_dummy_samples += train_config.train_batch_size + else: + num_dummy_samples_per_batch = ( + (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item() + ) + if num_dummy_samples_per_batch > 0: + num_dummy_samples += num_dummy_samples_per_batch + loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch + if train_config.task_type == "seq_classification": logits = model_outputs.logits labels = batch["labels"][:, 0] @@ -208,8 +230,7 @@ def train( acc_helper.forward(preds, labels) total_loss += loss.detach().float() - # Accumalate gradients - loss = loss / train_config.gradient_accumulation_steps + if train_config.enable_ddp: if local_rank == 0: if loss <= train_config.convergence_loss: @@ -237,6 +258,17 @@ def train( step_metric_val = float(torch.exp(loss.detach().float())) train_step_metric.append(step_metric_val) + # Accumalate gradients + complete_accum_steps = ( + len(train_dataloader) - len(train_dataloader) % train_config.gradient_accumulation_steps + ) + if step < complete_accum_steps: + num_samples_in_cur_update = train_config.gradient_accumulation_steps + else: + num_samples_in_cur_update = len(train_dataloader) % train_config.gradient_accumulation_steps + + loss = loss / num_samples_in_cur_update + if train_config.grad_scaler: scaler.scale(loss).backward() # backward pass else: @@ -296,15 +328,30 @@ def train( if loss_0_counter.item() == train_config.convergence_counter: if train_config.use_peft and train_config.from_peft_checkpoint and epoch == intermediate_epoch: - train_epoch_loss = total_loss / (step - intermediate_step) + train_epoch_loss = ( + 0.0 + if total_loss == 0.0 + else total_loss / (step - intermediate_step - num_dummy_samples / train_config.train_batch_size) + ) else: - train_epoch_loss = total_loss / step + train_epoch_loss = ( + 0.0 + if total_loss == 0.0 + else total_loss / (step + 1 - num_dummy_samples / train_config.train_batch_size) + ) else: if train_config.use_peft and train_config.from_peft_checkpoint and epoch == intermediate_epoch: - train_epoch_loss = total_loss / (len(train_dataloader) - intermediate_step) + train_epoch_loss = ( + 0.0 + if total_loss == 0.0 + else total_loss / (step - intermediate_step - (num_dummy_samples / train_config.train_batch_size)) + ) else: - train_epoch_loss = total_loss / len(train_dataloader) - + train_epoch_loss = ( + 0.0 + if total_loss == 0.0 + else total_loss / (step + 1 - (num_dummy_samples / train_config.train_batch_size)) + ) if train_config.task_type == "seq_classification": metric_val = acc_helper.compute() acc_helper.reset() @@ -389,7 +436,6 @@ def train( results["avg_checkpoint_time"] = avg_checkpoint_time if train_config.save_metrics: results["metrics_filename"] = metrics_filename - return results @@ -421,6 +467,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): eval_loss = 0.0 # Initialize evaluation loss device_type = torch.device(device).type + num_dummy_samples = 0 for step, batch in enumerate(tqdm(eval_dataloader, colour="green", desc="evaluating Epoch", dynamic_ncols=True)): # stop when the maximum number of eval steps is reached if train_config.max_eval_step > 0 and step > train_config.max_eval_step: @@ -439,6 +486,17 @@ def evaluation_helper(model, train_config, eval_dataloader, device): outputs = model(**batch) loss = outputs.loss + if (batch["labels"] != -100).sum() == 0: + loss = loss.nan_to_num(nan=0.0) + num_dummy_samples += 1 + else: + num_dummy_samples_per_batch = ( + (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item() + ) + if num_dummy_samples_per_batch > 0: + num_dummy_samples += num_dummy_samples_per_batch + loss = loss * train_config.val_batch_size / num_dummy_samples_per_batch + if train_config.task_type == "seq_classification": logits = outputs.logits labels = batch["labels"][:, 0] @@ -453,9 +511,10 @@ def evaluation_helper(model, train_config, eval_dataloader, device): val_step_metric.append(metric_val) eval_loss += loss.detach().float() - # Compute average loss and metric - eval_epoch_loss = eval_loss / len(eval_dataloader) + eval_epoch_loss = ( + 0.0 if eval_loss == 0.0 else eval_loss / (step + 1 - num_dummy_samples / train_config.val_batch_size) + ) if train_config.task_type == "seq_classification": eval_metric = acc_helper.compute() else: diff --git a/tests/finetune/test_finetune.py b/tests/finetune/test_finetune.py index 89a4d2498..b376234e5 100644 --- a/tests/finetune/test_finetune.py +++ b/tests/finetune/test_finetune.py @@ -50,10 +50,10 @@ def download_alpaca(): True, # run_validation True, # use_peft "qaic", # device - 0.0043353, # expected_train_loss - 1.0043447, # expected_train_metric - 0.0117334, # expected_eval_loss - 1.0118025, # expected_eval_metric + 1.5427961, # expected_train_loss + 4.6776514, # expected_train_metric + 1.2898713, # expected_eval_loss + 3.6323189, # expected_eval_metric id="llama_config_gsm8k", # config name ), pytest.param( @@ -68,10 +68,10 @@ def download_alpaca(): True, # run_validation True, # use_peft "qaic", # device - 0.0006099, # expected_train_loss - 1.0006101, # expected_train_metric - 0.0065296, # expected_eval_loss - 1.0065510, # expected_eval_metric + 1.4348667, # expected_train_loss + 4.1990857, # expected_train_metric + 1.5941212, # expected_eval_loss + 4.9239997, # expected_eval_metric id="llama_config_alpaca", # config name ), pytest.param( @@ -86,15 +86,16 @@ def download_alpaca(): True, # run_validation False, # use_peft "qaic", # device - 0.00052981, # expected_train_loss + 0.63060283, # expected_train_loss 0.55554199, # expected_train_metric - 0.00738618, # expected_eval_loss + 0.61503016, # expected_eval_loss 0.70825195, # expected_eval_metric id="bert_config_imdb", # config name ), ] +@pytest.mark.skip() # remove when it's clear why diff val_step_loss values are observed in diff runs on existing code (even without PR #478 changes) @pytest.mark.cli @pytest.mark.on_qaic @pytest.mark.finetune @@ -149,6 +150,7 @@ def test_finetune_llama( download_alpaca() results = finetune(**kwargs) + assert np.allclose(results["avg_train_loss"], expected_train_loss, atol=1e-3), "Train loss is not matching." assert np.allclose(results["avg_train_metric"], expected_train_metric, atol=1e-3), "Train metric is not matching." assert np.allclose(results["avg_eval_loss"], expected_eval_loss, atol=1e-3), "Eval loss is not matching." From 6254efe790fd74db1262be6d0aa5f1200dab9fe2 Mon Sep 17 00:00:00 2001 From: Shubham Agrawal Date: Mon, 7 Jul 2025 11:18:06 +0530 Subject: [PATCH 13/22] Fixed QNN data format config issue. (#480) Generating data format config file fails for encoder onnx graph without past key or past value. Fixed a coding bug in the function. --------- Signed-off-by: Shubham Agrawal --- ...erate_qnn_network_specialization_config.py | 2 +- docs/source/quick_start.md | 24 ++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/QEfficient/utils/generate_qnn_network_specialization_config.py b/QEfficient/utils/generate_qnn_network_specialization_config.py index 14d83efda..eca8e1873 100644 --- a/QEfficient/utils/generate_qnn_network_specialization_config.py +++ b/QEfficient/utils/generate_qnn_network_specialization_config.py @@ -166,8 +166,8 @@ def generate_data_format_config( for output in onnx_model.graph.output: if "past_key" in output.name or "past_value" in output.name: kv_nodes.append(output.name) - kv_overrides = {} + kv_overrides = {} kv_overrides["graphs"] = [ { "graph_name": model_dlc_name + "_configuration_1", diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 3896a616d..233fb491a 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -94,7 +94,7 @@ python -m QEfficient.cloud.execute --model_name gpt2 --qpc_path qeff_models/gpt2 You can run the finetune with set of predefined existing datasets on QAIC using the eager pipeline ```bash -python -m QEfficient.cloud.finetune --device qaic:0 --use-peft --output_dir ./meta-sam --num_epochs 2 --context_length 256 +python -m QEfficient.cloud.finetune --device qaic:0 --use-peft --output_dir ./meta-sam --num_epochs 2 --context_length 256 ``` For more details on finetune, checkout the subsection. @@ -138,6 +138,28 @@ Users can compile a model with QNN SDK by following the steps below: * Enabled QNN by passing enable_qnn flag, add --enable_qnn in the cli command. * An optional config file can be passed to override the default parameters. +**Default Parameters** + +QNN Converter Stage: + + "--float_bias_bitwidth 32 --float_bitwidth 16 --preserve_io_datatype --onnx_skip_simplification --target_backend AIC" + +QNN Context Binary Stage: + + LOG_LEVEL = "error" + COMPILER_COMPILATION_TARGET = "hardware" + COMPILER_CONVERT_TO_FP16 = True + COMPILER_DO_DDR_TO_MULTICAST = True + COMPILER_HARDWARE_VERSION = "2.0" + COMPILER_PERF_WARNINGS = False + COMPILER_PRINT_DDR_STATS = False + COMPILER_PRINT_PERF_METRICS = False + COMPILER_RETAINED_STATE = True + COMPILER_STAT_LEVEL = 10 + COMPILER_STATS_BATCH_SIZE = 1 + COMPILER_TIME_PASSES = False + + **CLI Inference Command** Without QNN Config From 2ba491d713a65e6090d4ce603c907dc5c33a994a Mon Sep 17 00:00:00 2001 From: asmigosw Date: Wed, 9 Jul 2025 14:10:44 +0530 Subject: [PATCH 14/22] Corrected Total Inference Time unit (#505) Changed Total (E2E) inference time from decode/sec to sec. Signed-off-by: Asmita Goswami --- QEfficient/generation/text_generation_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index a9690aa51..fd7ef03ff 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -60,7 +60,7 @@ def __repr__(self): return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\ \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} tokens/sec\ \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} tokens/sec\ - \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} tokens/sec" + \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} sec" @dataclass From 3aaa2d8986adc74c3e23c103b9830962e75e75a1 Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Wed, 9 Jul 2025 15:47:25 +0530 Subject: [PATCH 15/22] [QEff. Finetune]: Added support to sync gradients across devices during optimizer step only. (#477) Disabling gradient is necessary when using gradient_accumulation_step > 1 with ddp enabled. Currently, we are syncing gradient at every loss.backward() call, which is called at all steps. When using gradient accumulation, the weight update during opt.step() step. Only during that step, the gradients across each devices should sync with each other. with model.no_sync() --> context manager solves this issue. Here, we are not using it but instead setting ddp_model.require_backward_grad_sync to True or False depending on which step we are. --------- Signed-off-by: Meet Patel Signed-off-by: meetkuma --- QEfficient/finetune/utils/helper.py | 40 ++++++++++ QEfficient/finetune/utils/train_utils.py | 97 ++++++++++-------------- 2 files changed, 78 insertions(+), 59 deletions(-) diff --git a/QEfficient/finetune/utils/helper.py b/QEfficient/finetune/utils/helper.py index 8562b2aed..9e55a16ff 100644 --- a/QEfficient/finetune/utils/helper.py +++ b/QEfficient/finetune/utils/helper.py @@ -5,6 +5,15 @@ # # ----------------------------------------------------------------------------- import os +from contextlib import nullcontext + +import torch + +try: + import torch_qaic.debug as qaic_debug # noqa: F401 +except ImportError as e: + print(f"Warning: {e}. Moving ahead without these qaic modules.") + TASK_TYPE = ["generation", "seq_classification"] PEFT_METHOD = ["lora"] @@ -14,3 +23,34 @@ def get_num_ddp_devices(): return int(os.getenv("WORLD_SIZE", 1)) + + +def get_autocast_ctx(use_autocast, device_type, dtype=torch.float16): + return torch.autocast(device_type=device_type, dtype=dtype) if use_autocast else nullcontext() + + +def get_op_verifier_ctx( + use_op_by_op_verifier, + train_device, + dump_dir, + step, + ref_device="cpu", + ref_dtype=torch.float32, + atol=1e-1, + rtol=1e-5, + use_ref_output_on_mismatch=True, +): + if not use_op_by_op_verifier: + return nullcontext() + + filter_config = qaic_debug.DispatchFilterConfig.default(train_device) + dump_dir = dump_dir + "_" + str(step) + return qaic_debug.OpByOpVerifierMode( + ref_device=ref_device, + ref_dtype=ref_dtype, + atol=atol, + rtol=rtol, + use_ref_output_on_mismatch=use_ref_output_on_mismatch, + filter_config=filter_config, + dump_root_dir=dump_dir, + ) diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index f513ba5c4..6eb44dc43 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -8,8 +8,8 @@ import json import os import time -from contextlib import nullcontext from datetime import datetime +from functools import partial from typing import Dict, List, Tuple import torch @@ -19,6 +19,7 @@ from tqdm import tqdm from QEfficient.finetune.configs.training import TrainConfig +from QEfficient.finetune.utils.helper import get_autocast_ctx, get_op_verifier_ctx try: import torch_qaic # noqa: F401 @@ -110,6 +111,9 @@ def train( num_classes = model.classifier.out_features acc_helper = torchmetrics.classification.MulticlassAccuracy(num_classes=num_classes).to(device) + autocast_ctx = get_autocast_ctx(train_config.use_autocast, device_type, dtype=torch.float16) + op_verifier_ctx = partial(get_op_verifier_ctx, train_config.opByOpVerifier, device, train_config.dump_root_dir) + # Start the training loop for epoch in range(train_config.num_epochs): if loss_0_counter.item() == train_config.convergence_counter: @@ -174,60 +178,38 @@ def train( break batch = {k: v.to(device) for k, v in batch.items()} # move the batch elements to qaic device - with ( - torch.autocast(device_type=device_type, dtype=torch.float16) - if train_config.use_autocast - else nullcontext() - ): - # an additional condition can be put here to avoid opByOpVerifier getting triggered for each step - if train_config.opByOpVerifier: - with qaic_debug.OpByOpVerifierMode( - ref_device="cpu", - ref_dtype=torch.float32, - # adjust atol & rtol this as required - atol=1e-1, - use_ref_output_on_mismatch=True, - filter_config=qaic_debug.DispatchFilterConfig.default(device), - dump_root_dir=train_config.dump_root_dir + str(step), - ) as verifier: - model_outputs = model(**batch) - loss = model_outputs.loss # Forward call - if (batch["labels"] != -100).sum() == 0: - loss = loss.nan_to_num(nan=0.0) - num_dummy_samples += train_config.train_batch_size - else: - num_dummy_samples_per_batch = ( - (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item() - ) - if num_dummy_samples_per_batch > 0: - num_dummy_samples += num_dummy_samples_per_batch - loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch - - if train_config.task_type == "seq_classification": - logits = model_outputs.logits - labels = batch["labels"][:, 0] - preds = torch.nn.functional.softmax(logits, dim=-1) - acc_helper.forward(preds, labels) - print("Mismatches detected:", verifier.get_perop_mismatch_count()) + is_optimizer_step = (step + 1) % train_config.gradient_accumulation_steps == 0 or step == len( + train_dataloader + ) - 1 + if train_config.enable_ddp: + # Below block derived from : https://github.com/karpathy/nanoGPT/blob/93a43d9a5c22450bbf06e78da2cb6eeef084b717/train.py#L293 + # in DDP training we only need to sync gradients at the last micro step. + # the official way to do this is with model.no_sync() context manager, but + # using too many context managers may bloat the code and forces us to repeat code + # looking at the source of that context manager, it just toggles this variable + model.require_backward_grad_sync = is_optimizer_step + + with autocast_ctx, op_verifier_ctx(step) as verifier: + model_outputs = model(**batch) + loss = model_outputs.loss # Forward call + if (batch["labels"] != -100).sum() == 0: + loss = loss.nan_to_num(nan=0.0) + num_dummy_samples += train_config.train_batch_size else: - model_outputs = model(**batch) - loss = model_outputs.loss # Forward call - if (batch["labels"] != -100).sum() == 0: - loss = loss.nan_to_num(nan=0.0) - num_dummy_samples += train_config.train_batch_size - else: - num_dummy_samples_per_batch = ( - (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item() - ) - if num_dummy_samples_per_batch > 0: - num_dummy_samples += num_dummy_samples_per_batch - loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch + num_dummy_samples_per_batch = ( + (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item() + ) + if num_dummy_samples_per_batch > 0: + num_dummy_samples += num_dummy_samples_per_batch + loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch - if train_config.task_type == "seq_classification": - logits = model_outputs.logits - labels = batch["labels"][:, 0] - preds = torch.nn.functional.softmax(logits, dim=-1) - acc_helper.forward(preds, labels) + if train_config.task_type == "seq_classification": + logits = model_outputs.logits + labels = batch["labels"][:, 0] + preds = torch.nn.functional.softmax(logits, dim=-1) + acc_helper.forward(preds, labels) + if train_config.opByOpVerifier: + print("Mismatches detected:", verifier.get_perop_mismatch_count()) total_loss += loss.detach().float() @@ -274,7 +256,7 @@ def train( else: loss.backward() # backward pass - if (step + 1) % train_config.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: + if is_optimizer_step: if train_config.grad_scaler: scaler.step(optimizer) scaler.update() @@ -468,6 +450,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): device_type = torch.device(device).type num_dummy_samples = 0 + autocast_ctx = get_autocast_ctx(train_config.use_autocast, device_type, dtype=torch.float16) for step, batch in enumerate(tqdm(eval_dataloader, colour="green", desc="evaluating Epoch", dynamic_ncols=True)): # stop when the maximum number of eval steps is reached if train_config.max_eval_step > 0 and step > train_config.max_eval_step: @@ -478,11 +461,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): # Ensure no gradients are computed for this scope to save memory with torch.no_grad(): # Forward pass and compute loss - with ( - torch.autocast(device_type=device_type, dtype=torch.float16) - if train_config.use_autocast - else nullcontext() - ): + with autocast_ctx: outputs = model(**batch) loss = outputs.loss From 30d1579893995e71c42d5f1334f671f795bd85fb Mon Sep 17 00:00:00 2001 From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com> Date: Wed, 9 Jul 2025 19:46:22 +0530 Subject: [PATCH 16/22] [QEff Finetune]: Implement logger for finetuning and enable dumping (#371) 1. Implement logger for finetuning 2. enable dumping logs by given flag --------- Signed-off-by: Mamta Singh Co-authored-by: Mamta Singh --- QEfficient/cloud/finetune.py | 48 ++++--- QEfficient/finetune/configs/training.py | 5 +- QEfficient/finetune/dataset/alpaca_dataset.py | 10 +- QEfficient/finetune/dataset/custom_dataset.py | 28 ++-- .../finetune/dataset/grammar_dataset.py | 17 ++- QEfficient/finetune/eval.py | 20 ++- QEfficient/finetune/utils/config_utils.py | 30 +++-- QEfficient/finetune/utils/dataset_utils.py | 12 +- QEfficient/finetune/utils/helper.py | 6 +- QEfficient/finetune/utils/logging_utils.py | 54 ++++++++ QEfficient/finetune/utils/parser.py | 14 +- QEfficient/finetune/utils/plot_metrics.py | 8 +- QEfficient/finetune/utils/train_utils.py | 123 ++++++++---------- 13 files changed, 221 insertions(+), 154 deletions(-) create mode 100644 QEfficient/finetune/utils/logging_utils.py diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py index 1e0dc48bc..63fe2106a 100644 --- a/QEfficient/cloud/finetune.py +++ b/QEfficient/cloud/finetune.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import logging import random import warnings from typing import Any, Dict, Optional, Union @@ -17,7 +18,7 @@ import torch.utils.data from peft import PeftModel, get_peft_model from torch.optim.lr_scheduler import StepLR -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer from QEfficient.finetune.configs.training import TrainConfig from QEfficient.finetune.utils.config_utils import ( @@ -26,18 +27,22 @@ update_config, ) from QEfficient.finetune.utils.dataset_utils import get_dataloader +from QEfficient.finetune.utils.logging_utils import logger from QEfficient.finetune.utils.parser import get_finetune_parser -from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train -from QEfficient.utils._utils import login_and_download_hf_lm +from QEfficient.finetune.utils.train_utils import ( + get_longest_seq_length, + print_model_size, + print_trainable_parameters, + train, +) +from QEfficient.utils._utils import hf_download # Try importing QAIC-specific module, proceed without it if unavailable try: import torch_qaic # noqa: F401 except ImportError as e: - print(f"Warning: {e}. Proceeding without QAIC modules.") - + logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.", logging.WARNING) -from transformers import AutoModelForSequenceClassification # Suppress all warnings warnings.filterwarnings("ignore") @@ -106,7 +111,8 @@ def load_model_and_tokenizer( - Resizes model embeddings if tokenizer vocab size exceeds model embedding size. - Sets pad_token_id to eos_token_id if not defined in the tokenizer. """ - pretrained_model_path = login_and_download_hf_lm(train_config.model_name) + logger.log_rank_zero(f"Loading HuggingFace model for {train_config.model_name}") + pretrained_model_path = hf_download(train_config.model_name) if train_config.task_type == "seq_classification": model = AutoModelForSequenceClassification.from_pretrained( pretrained_model_path, @@ -116,7 +122,7 @@ def load_model_and_tokenizer( ) if not hasattr(model, "base_model_prefix"): - raise RuntimeError("Given huggingface model does not have 'base_model_prefix' attribute.") + logger.raise_error("Given huggingface model does not have 'base_model_prefix' attribute.", RuntimeError) for param in getattr(model, model.base_model_prefix).parameters(): param.requires_grad = False @@ -141,11 +147,10 @@ def load_model_and_tokenizer( # If there is a mismatch between tokenizer vocab size and embedding matrix, # throw a warning and then expand the embedding matrix if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: - print("WARNING: Resizing embedding matrix to match tokenizer vocab size.") + logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logging.WARNING) model.resize_token_embeddings(len(tokenizer)) - # FIXME (Meet): Cover below line inside the logger once it is implemented. - print_model_size(model, train_config) + print_model_size(model) # Note: Need to call this before calling PeftModel.from_pretrained or get_peft_model. # Because, both makes model.is_gradient_checkpointing = True which is used in peft library to @@ -157,7 +162,9 @@ def load_model_and_tokenizer( if hasattr(model, "supports_gradient_checkpointing") and model.supports_gradient_checkpointing: model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"preserve_rng_state": False}) else: - raise RuntimeError("Given model doesn't support gradient checkpointing. Please disable it and run it.") + logger.raise_error( + "Given model doesn't support gradient checkpointing. Please disable it and run it.", RuntimeError + ) model = apply_peft(model, train_config, peft_config_file, **kwargs) @@ -192,7 +199,7 @@ def apply_peft( else: peft_config = generate_peft_config(train_config, peft_config_file, **kwargs) model = get_peft_model(model, peft_config) - model.print_trainable_parameters() + print_trainable_parameters(model) return model @@ -217,7 +224,7 @@ def setup_dataloaders( - Length of longest sequence in the dataset. Raises: - ValueError: If validation is enabled but the validation set is too small. + RuntimeError: If validation is enabled but the validation set is too small. Notes: - Applies a custom data collator if provided by get_custom_data_collator. @@ -225,17 +232,18 @@ def setup_dataloaders( """ train_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="train") - print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}") + logger.log_rank_zero(f"Number of Training Set Batches loaded = {len(train_dataloader)}") eval_dataloader = None if train_config.run_validation: eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="val") if len(eval_dataloader) == 0: - raise ValueError( - f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})" + logger.raise_error( + f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})", + ValueError, ) else: - print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}") longest_seq_length, _ = get_longest_seq_length( torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset]) @@ -274,13 +282,15 @@ def main(peft_config_file: str = None, **kwargs) -> None: dataset_config = generate_dataset_config(train_config.dataset) update_config(dataset_config, **kwargs) + logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level) + setup_distributed_training(train_config) setup_seeds(train_config.seed) model, tokenizer = load_model_and_tokenizer(train_config, dataset_config, peft_config_file, **kwargs) # Create DataLoaders for the training and validation dataset train_dataloader, eval_dataloader, longest_seq_length = setup_dataloaders(train_config, dataset_config, tokenizer) - print( + logger.log_rank_zero( f"The longest sequence length in the train data is {longest_seq_length}, " f"passed context length is {train_config.context_length} and overall model's context length is " f"{model.config.max_position_embeddings}" diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py index deac537bc..383d0e2b4 100644 --- a/QEfficient/finetune/configs/training.py +++ b/QEfficient/finetune/configs/training.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import logging from dataclasses import dataclass @@ -94,5 +95,7 @@ class TrainConfig: use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time. # profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler - dump_root_dir: str = "mismatches/step_" opByOpVerifier: bool = False + + dump_logs: bool = True + log_level: str = logging.INFO diff --git a/QEfficient/finetune/dataset/alpaca_dataset.py b/QEfficient/finetune/dataset/alpaca_dataset.py index aecc0d2cc..c6ddb6ce1 100644 --- a/QEfficient/finetune/dataset/alpaca_dataset.py +++ b/QEfficient/finetune/dataset/alpaca_dataset.py @@ -11,6 +11,8 @@ import torch from torch.utils.data import Dataset +from QEfficient.finetune.utils.logging_utils import logger + PROMPT_DICT = { "prompt_input": ( "Below is an instruction that describes a task, paired with an input that provides further context. " @@ -27,7 +29,13 @@ class InstructionDataset(Dataset): def __init__(self, dataset_config, tokenizer, partition="train", context_length=None): - self.ann = json.load(open(dataset_config.data_path)) + try: + self.ann = json.load(open(dataset_config.data_path)) + except FileNotFoundError: + logger.raise_error( + "Loading of alpaca dataset failed! Please use (wget -c https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json -P dataset/) to download the alpaca dataset.", + FileNotFoundError, + ) # Use 5% of the dataset for evaluation eval_length = int(len(self.ann) / 20) if partition == "train": diff --git a/QEfficient/finetune/dataset/custom_dataset.py b/QEfficient/finetune/dataset/custom_dataset.py index 6d9baf90d..4a1f500e3 100644 --- a/QEfficient/finetune/dataset/custom_dataset.py +++ b/QEfficient/finetune/dataset/custom_dataset.py @@ -8,6 +8,8 @@ import importlib from pathlib import Path +from QEfficient.finetune.utils.logging_utils import logger + def load_module_from_py_file(py_file: str) -> object: """ @@ -30,20 +32,22 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=Non module_path, func_name = dataset_config.file, "get_custom_dataset" if not module_path.endswith(".py"): - raise ValueError(f"Dataset file {module_path} is not a .py file.") + logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError) module_path = Path(module_path) if not module_path.is_file(): - raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") + logger.raise_error( + f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError + ) module = load_module_from_py_file(module_path.as_posix()) try: return getattr(module, func_name)(dataset_config, tokenizer, split, context_length) - except AttributeError as e: - print( - f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})." + except AttributeError: + logger.raise_error( + f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).", + AttributeError, ) - raise e def get_data_collator(dataset_processer, dataset_config): @@ -53,16 +57,20 @@ def get_data_collator(dataset_processer, dataset_config): module_path, func_name = dataset_config.file, "get_data_collator" if not module_path.endswith(".py"): - raise ValueError(f"Dataset file {module_path} is not a .py file.") + logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError) module_path = Path(module_path) if not module_path.is_file(): - raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.") + logger.raise_error( + f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError + ) module = load_module_from_py_file(module_path.as_posix()) try: return getattr(module, func_name)(dataset_processer) except AttributeError: - print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).") - print("Using the default data_collator instead.") + logger.log_rank_zero( + f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()})." + ) + logger.log_rank_zero("Using the default data_collator instead.") return None diff --git a/QEfficient/finetune/dataset/grammar_dataset.py b/QEfficient/finetune/dataset/grammar_dataset.py index 43ee39158..e40c01e97 100644 --- a/QEfficient/finetune/dataset/grammar_dataset.py +++ b/QEfficient/finetune/dataset/grammar_dataset.py @@ -10,6 +10,8 @@ from datasets import load_dataset from torch.utils.data import Dataset +from QEfficient.finetune.utils.logging_utils import logger + class grammar(Dataset): def __init__(self, tokenizer, csv_name=None, context_length=None): @@ -19,11 +21,11 @@ def __init__(self, tokenizer, csv_name=None, context_length=None): data_files={"train": [csv_name]}, # "eval": "grammar_validation.csv"}, delimiter=",", ) - except Exception as e: - print( - "Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset." + except FileNotFoundError: + logger.raise_error( + "Loading of grammar dataset failed! Please check (https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset.", + FileNotFoundError, ) - raise e self.context_length = context_length self.tokenizer = tokenizer @@ -36,7 +38,7 @@ def convert_to_features(self, example_batch): # Create prompt and tokenize contexts and questions if self.print_text: - print("Input Text: ", self.clean_text(example_batch["text"])) + logger.log_rank_zero("Input Text: ", self.clean_text(example_batch["text"])) input_ = example_batch["input"] target_ = example_batch["target"] @@ -71,9 +73,6 @@ def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None): """cover function for handling loading the working dataset""" """dataset loading""" currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv" - print(f"Loading dataset {currPath}") - csv_name = str(currPath) - print(csv_name) - dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length) + dataset = grammar(tokenizer=tokenizer, csv_name=str(currPath), context_length=context_length) return dataset diff --git a/QEfficient/finetune/eval.py b/QEfficient/finetune/eval.py index c0d29d38b..72407a91e 100644 --- a/QEfficient/finetune/eval.py +++ b/QEfficient/finetune/eval.py @@ -19,13 +19,14 @@ from utils.train_utils import evaluation, print_model_size from QEfficient.finetune.configs.training import TrainConfig +from QEfficient.finetune.utils.logging_utils import logger try: import torch_qaic # noqa: F401 device = "qaic:0" except ImportError as e: - print(f"Warning: {e}. Moving ahead without these qaic modules.") + logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.") device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Suppress all warnings @@ -77,25 +78,20 @@ def main(**kwargs): # If there is a mismatch between tokenizer vocab size and embedding matrix, # throw a warning and then expand the embedding matrix if len(tokenizer) > model.get_input_embeddings().weight.shape[0]: - print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.") + logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.") model.resize_token_embeddings(len(tokenizer)) - print_model_size(model, train_config) + print_model_size(model) if train_config.run_validation: - # TODO: vbaddi enable packing later in entire infra. - # if train_config.batching_strategy == "packing": - # dataset_val = ConcatDataset(dataset_val, chunk_size=train_config.context_length) - eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="test") - - print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") if len(eval_dataloader) == 0: - raise ValueError( - f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})" + logger.raise_error( + f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})", + ValueError, ) else: - print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}") + logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}") model.to(device) _ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device) diff --git a/QEfficient/finetune/utils/config_utils.py b/QEfficient/finetune/utils/config_utils.py index bdc3c0429..90c15cd7f 100644 --- a/QEfficient/finetune/utils/config_utils.py +++ b/QEfficient/finetune/utils/config_utils.py @@ -18,6 +18,7 @@ from QEfficient.finetune.configs.peft_config import LoraConfig from QEfficient.finetune.configs.training import TrainConfig from QEfficient.finetune.dataset.dataset_config import DATASET_PREPROC +from QEfficient.finetune.utils.logging_utils import logger def update_config(config, **kwargs): @@ -43,11 +44,12 @@ def update_config(config, **kwargs): if hasattr(config, param_name): setattr(config, param_name, v) else: - raise ValueError(f"Config '{config_name}' does not have parameter: '{param_name}'") + logger.raise_error( + f"Config '{config_name}' does not have parameter: '{param_name}'", ValueError + ) else: config_type = type(config).__name__ - # FIXME (Meet): Once logger is available put this in debug level. - print(f"[WARNING]: Unknown parameter '{k}' for config type '{config_type}'") + logger.debug(f"Unknown parameter '{k}' for config type '{config_type}'") def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None, **kwargs) -> Any: @@ -70,7 +72,7 @@ def generate_peft_config(train_config: TrainConfig, peft_config_file: str = None else: config_map = {"lora": (LoraConfig, PeftLoraConfig)} if train_config.peft_method not in config_map: - raise RuntimeError(f"Peft config not found: {train_config.peft_method}") + logger.raise_error(f"Peft config not found: {train_config.peft_method}", RuntimeError) config_cls, peft_config_cls = config_map[train_config.peft_method] if config_cls is None: @@ -119,7 +121,7 @@ def validate_config(config_data: Dict[str, Any], config_type: str = "lora") -> N - Ensures types match expected values (int, float, list, etc.). """ if config_type.lower() != "lora": - raise ValueError(f"Unsupported config_type: {config_type}. Only 'lora' is supported.") + logger.raise_error(f"Unsupported config_type: {config_type}. Only 'lora' is supported.", ValueError) required_fields = { "r": int, @@ -136,26 +138,28 @@ def validate_config(config_data: Dict[str, Any], config_type: str = "lora") -> N # Check for missing required fields missing_fields = [field for field in required_fields if field not in config_data] if missing_fields: - raise ValueError(f"Missing required fields in {config_type} config: {missing_fields}") + logger.raise_error(f"Missing required fields in {config_type} config: {missing_fields}", ValueError) # Validate types of required fields for field, expected_type in required_fields.items(): if not isinstance(config_data[field], expected_type): - raise ValueError( + logger.raise_error( f"Field '{field}' in {config_type} config must be of type {expected_type.__name__}, " - f"got {type(config_data[field]).__name__}" + f"got {type(config_data[field]).__name__}", + ValueError, ) # Validate target_modules contains strings if not all(isinstance(mod, str) for mod in config_data["target_modules"]): - raise ValueError("All elements in 'target_modules' must be strings") + logger.raise_error("All elements in 'target_modules' must be strings", ValueError) # Validate types of optional fields if present for field, expected_type in optional_fields.items(): if field in config_data and not isinstance(config_data[field], expected_type): - raise ValueError( + logger.raise_error( f"Field '{field}' in {config_type} config must be of type {expected_type.__name__}, " - f"got {type(config_data[field]).__name__}" + f"got {type(config_data[field]).__name__}", + ValueError, ) @@ -173,7 +177,7 @@ def load_config_file(config_path: str) -> Dict[str, Any]: ValueError: If the file format is unsupported. """ if not os.path.exists(config_path): - raise FileNotFoundError(f"Config file not found: {config_path}") + logger.raise_error(f"Config file not found: {config_path}", FileNotFoundError) with open(config_path, "r") as f: if config_path.endswith(".yaml") or config_path.endswith(".yml"): @@ -181,4 +185,4 @@ def load_config_file(config_path: str) -> Dict[str, Any]: elif config_path.endswith(".json"): return json.load(f) else: - raise ValueError("Unsupported config file format. Use .yaml, .yml, or .json") + logger.raise_error("Unsupported config file format. Use .yaml, .yml, or .json", ValueError) diff --git a/QEfficient/finetune/utils/dataset_utils.py b/QEfficient/finetune/utils/dataset_utils.py index a0f7d19cd..aacff2bb5 100644 --- a/QEfficient/finetune/utils/dataset_utils.py +++ b/QEfficient/finetune/utils/dataset_utils.py @@ -12,13 +12,14 @@ from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler from QEfficient.finetune.dataset.dataset_config import DATALOADER_COLLATE_FUNC, DATASET_PREPROC from QEfficient.finetune.utils.helper import get_num_ddp_devices +from QEfficient.finetune.utils.logging_utils import logger def get_preprocessed_dataset( tokenizer, dataset_config, split: str = "train", context_length: int = None ) -> torch.utils.data.Dataset: if dataset_config.dataset not in DATASET_PREPROC: - raise NotImplementedError(f"{dataset_config.dataset} is not (yet) implemented") + logger.raise_error(f"{dataset_config.dataset} is not (yet) implemented", NotImplementedError) def get_split(): return dataset_config.train_split if split == "train" else dataset_config.test_split @@ -39,8 +40,9 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split): if train_config.enable_ddp: if train_config.enable_sorting_for_ddp: if train_config.context_length: - raise ValueError( - "Sorting cannot be done with padding, Please disable sorting or pass context_length as None to disable padding" + logger.raise_error( + "Sorting cannot be done with padding, Please disable sorting or pass context_length as None to disable padding", + ValueError, ) else: kwargs["batch_sampler"] = DistributedLengthBasedBatchSampler( @@ -104,9 +106,9 @@ def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train" print("custom_data_collator is used") dl_kwargs["collate_fn"] = custom_data_collator - print(f"length of dataset_{split}", len(dataset)) - # Create data loader + logger.log_rank_zero(f"Length of {split} dataset is {len(dataset)}") + # Create data loader dataloader = torch.utils.data.DataLoader( dataset, num_workers=train_config.num_workers_dataloader, diff --git a/QEfficient/finetune/utils/helper.py b/QEfficient/finetune/utils/helper.py index 9e55a16ff..e8a6d1ccb 100644 --- a/QEfficient/finetune/utils/helper.py +++ b/QEfficient/finetune/utils/helper.py @@ -21,6 +21,10 @@ BATCHING_STRATEGY = ["padding", "packing"] +def is_rank_zero(): + return int(os.getenv("LOCAL_RANK", 0)) == 0 + + def get_num_ddp_devices(): return int(os.getenv("WORLD_SIZE", 1)) @@ -44,7 +48,7 @@ def get_op_verifier_ctx( return nullcontext() filter_config = qaic_debug.DispatchFilterConfig.default(train_device) - dump_dir = dump_dir + "_" + str(step) + dump_dir = dump_dir + "/mismatches/step_" + str(step) return qaic_debug.OpByOpVerifierMode( ref_device=ref_device, ref_dtype=ref_dtype, diff --git a/QEfficient/finetune/utils/logging_utils.py b/QEfficient/finetune/utils/logging_utils.py new file mode 100644 index 000000000..15a67223f --- /dev/null +++ b/QEfficient/finetune/utils/logging_utils.py @@ -0,0 +1,54 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import logging +import os +from datetime import datetime + +from QEfficient.finetune.utils.helper import is_rank_zero + + +class FTLogger: + def __init__(self): + self.logger = logging.getLogger("QEfficient") + if not getattr(self.logger, "_custom_methods_added", False): + self._bind_custom_methods() + self.logger._custom_methods_added = True # Prevent adding handlers/methods twice + + def _bind_custom_methods(self): + def raise_error(message, errortype=RuntimeError): + self.logger.error(message) + raise errortype(message) + + def log_rank_zero(msg: str, level: int = logging.INFO): + if is_rank_zero(): + self.logger.log(level, msg, stacklevel=2) + + def prepare_for_logs(output_path, dump_logs=False, level=logging.INFO): + self.logger.setLevel(level) + if dump_logs: + logs_path = os.path.join(output_path, "logs") + if not os.path.exists(logs_path): + os.makedirs(logs_path, exist_ok=True) + file_name = f"log-file-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + ".txt" + log_file = os.path.join(logs_path, file_name) + + fh = logging.FileHandler(log_file) + fh.setLevel(level) + formatter = logging.Formatter("%(levelname)s - %(name)s - %(message)s") + fh.setFormatter(formatter) + self.logger.addHandler(fh) + + self.logger.raise_error = raise_error + self.logger.log_rank_zero = log_rank_zero + self.logger.prepare_for_logs = prepare_for_logs + + def get_logger(self): + return self.logger + + +logger = FTLogger().get_logger() diff --git a/QEfficient/finetune/utils/parser.py b/QEfficient/finetune/utils/parser.py index 39ce5f969..980f6a3b9 100644 --- a/QEfficient/finetune/utils/parser.py +++ b/QEfficient/finetune/utils/parser.py @@ -254,18 +254,14 @@ def get_finetune_parser(): action="store_true", help="Enable distributed data parallel training. This will load the replicas of model on given number of devices and train the model. This should be used using torchrun interface. Please check docs for exact usage.", ) - parser.add_argument( - "--dump_root_dir", - "--dump-root-dir", - required=False, - type=str, - default="mismatches/step_", - help="Directory for mismatch dumps by opByOpVerifier", - ) parser.add_argument( "--opByOpVerifier", action="store_true", - help="Enable operation-by-operation verification w.r.t reference device(cpu). It is a context manager interface that captures and verifies each operator against reference device. In case results of test & reference do not match under given tolerances, a standalone unittest is generated at dump_root_dir.", + help=argparse.SUPPRESS, + # This is for debugging purpose only. + # Enables operation-by-operation verification w.r.t reference device(cpu). + # It is a context manager interface that captures and verifies each operator against reference device. + # In case results of test & reference do not match under given tolerances, a standalone unittest is generated at dump_root_dir. ) return parser diff --git a/QEfficient/finetune/utils/plot_metrics.py b/QEfficient/finetune/utils/plot_metrics.py index 416ec3cdf..1e22bc6a8 100644 --- a/QEfficient/finetune/utils/plot_metrics.py +++ b/QEfficient/finetune/utils/plot_metrics.py @@ -11,6 +11,8 @@ import matplotlib.pyplot as plt +from QEfficient.finetune.utils.logging_utils import logger + def plot_metric(data, metric_name, x_label, y_label, title, colors): plt.figure(figsize=(7, 6)) @@ -67,14 +69,14 @@ def plot_metrics_by_step(data, metric_name, x_label, y_label, colors): def plot_metrics(file_path): if not os.path.exists(file_path): - print(f"File {file_path} does not exist.") + logger.raise_error(f"File {file_path} does not exist.", FileNotFoundError) return with open(file_path, "r") as f: try: data = json.load(f) - except json.JSONDecodeError: - print("Invalid JSON file.") + except json.JSONDecodeError as e: + logger.raise_error("Invalid JSON file.", e) return directory = os.path.dirname(file_path) diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index 6eb44dc43..03fb6b5a6 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -19,7 +19,8 @@ from tqdm import tqdm from QEfficient.finetune.configs.training import TrainConfig -from QEfficient.finetune.utils.helper import get_autocast_ctx, get_op_verifier_ctx +from QEfficient.finetune.utils.helper import get_autocast_ctx, get_op_verifier_ctx, is_rank_zero +from QEfficient.finetune.utils.logging_utils import logger try: import torch_qaic # noqa: F401 @@ -28,7 +29,7 @@ import torch_qaic.utils as qaic_utils # noqa: F401 from torch.qaic.amp import GradScaler as QAicGradScaler except ImportError as e: - print(f"Warning: {e}. Moving ahead without these qaic modules.") + logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.") from torch.amp import GradScaler @@ -84,11 +85,9 @@ def train( max_steps_reached = False # Flag to indicate max training steps reached tensorboard_updates = None - if train_config.enable_ddp: - if local_rank == 0: - tensorboard_updates = SummaryWriter() - else: - tensorboard_updates = SummaryWriter() + if is_rank_zero(): + tensorboard_log_dir = train_config.output_dir + "/runs/" + f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" + tensorboard_updates = SummaryWriter(log_dir=tensorboard_log_dir) device_type = torch.device(device).type @@ -112,32 +111,26 @@ def train( acc_helper = torchmetrics.classification.MulticlassAccuracy(num_classes=num_classes).to(device) autocast_ctx = get_autocast_ctx(train_config.use_autocast, device_type, dtype=torch.float16) - op_verifier_ctx = partial(get_op_verifier_ctx, train_config.opByOpVerifier, device, train_config.dump_root_dir) + op_verifier_ctx = partial(get_op_verifier_ctx, train_config.opByOpVerifier, device, train_config.output_dir) # Start the training loop for epoch in range(train_config.num_epochs): if loss_0_counter.item() == train_config.convergence_counter: - if train_config.enable_ddp: - print( - f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." - ) - break - else: - print( - f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." - ) - break + logger.log_rank_zero( + f"Skipping epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps." + ) + break if train_config.use_peft and train_config.from_peft_checkpoint: intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1 if epoch < intermediate_epoch: - print(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") + logger.log_rank_zero(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.") # to bring the count of train_step in sync with where it left off total_train_steps += len(train_dataloader) continue - print(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") - print(f"train_config.max_train_step: {train_config.max_train_step}") + logger.log_rank_zero(f"Starting epoch {epoch + 1}/{train_config.num_epochs}") + logger.log_rank_zero(f"train_config.max_train_step: {train_config.max_train_step}") # stop when the maximum number of training steps is reached if max_steps_reached: break @@ -164,8 +157,8 @@ def train( # to bring the count of train_step in sync with where it left off if epoch == intermediate_epoch and step == 0: total_train_steps += intermediate_step - print( - f"skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for them." + logger.log_rank_zero( + f"Skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for it." ) if epoch == intermediate_epoch and step < intermediate_step: total_train_steps += 1 @@ -209,27 +202,17 @@ def train( preds = torch.nn.functional.softmax(logits, dim=-1) acc_helper.forward(preds, labels) if train_config.opByOpVerifier: - print("Mismatches detected:", verifier.get_perop_mismatch_count()) + logger.info("Mismatches detected:", verifier.get_perop_mismatch_count()) total_loss += loss.detach().float() - - if train_config.enable_ddp: - if local_rank == 0: - if loss <= train_config.convergence_loss: - loss_0_counter += 1 - else: - loss_0_counter = torch.tensor([0]).to(device) - dist.broadcast(loss_0_counter, src=0) - else: + if is_rank_zero(): if loss <= train_config.convergence_loss: loss_0_counter += 1 else: loss_0_counter = torch.tensor([0]).to(device) - if train_config.enable_ddp: - if local_rank == 0: - tensorboard_updates.add_scalars("loss", {"train": loss}, total_train_steps) - else: + dist.broadcast(loss_0_counter, src=0) + if is_rank_zero(): tensorboard_updates.add_scalars("loss", {"train": loss}, total_train_steps) if train_config.save_metrics: @@ -291,18 +274,11 @@ def train( val_step_metric, val_metric, ) - if train_config.enable_ddp: - if loss_0_counter.item() == train_config.convergence_counter: - print( - f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}." - ) - break - else: - if loss_0_counter.item() == train_config.convergence_counter: - print( - f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning." - ) - break + if loss_0_counter.item() == train_config.convergence_counter: + logger.log_rank_zero( + f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps.Hence,stopping the fine tuning." + ) + break pbar.close() epoch_end_time = time.perf_counter() - epoch_start_time @@ -347,18 +323,10 @@ def train( lr_scheduler.step() if train_config.run_validation: - if train_config.enable_ddp: - dist.barrier() - eval_epoch_loss, eval_metric, temp_val_loss, temp_step_metric = evaluation_helper( - model, train_config, eval_dataloader, device - ) - if local_rank == 0: - tensorboard_updates.add_scalars("loss", {"eval": eval_epoch_loss}, total_train_steps) - - else: - eval_epoch_loss, eval_metric, temp_val_loss, temp_step_metric = evaluation_helper( - model, train_config, eval_dataloader, device - ) + eval_epoch_loss, eval_metric, temp_val_loss, temp_step_metric = evaluation_helper( + model, train_config, eval_dataloader, device + ) + if is_rank_zero(): tensorboard_updates.add_scalars("loss", {"eval": eval_epoch_loss}, total_train_steps) if train_config.save_metrics: @@ -376,15 +344,15 @@ def train( if train_config.run_validation: if eval_epoch_loss < best_val_loss: best_val_loss = eval_epoch_loss - print(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") + logger.log_rank_zero(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") val_loss.append(float(eval_epoch_loss)) val_metric.append(float(eval_metric)) if train_config.task_type == "seq_classification": - print( + logger.log_rank_zero( f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" ) else: - print( + logger.log_rank_zero( f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" ) @@ -431,6 +399,9 @@ def evaluation_helper(model, train_config, eval_dataloader, device): Returns: eval_epoch_loss, eval_metric, eval_step_loss, eval_step_metric """ + if train_config.enable_ddp: + dist.barrier() + model.eval() if train_config.task_type == "seq_classification": @@ -500,7 +471,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): eval_metric = torch.exp(eval_epoch_loss) # Print evaluation metrics - print(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") + logger.log_rank_zero(f"{eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") return eval_epoch_loss, eval_metric, val_step_loss, val_step_metric @@ -513,18 +484,28 @@ def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: return longest_seq_length, longest_seq_ix -def print_model_size(model, config) -> None: +def print_model_size(model) -> None: """ Print model name, the number of trainable parameters and initialization time. Args: - model: The PyTorch model. - model_name (str): Name of the model. + model: PyTorch model. """ - - print(f"--> Model {config.model_name}") total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - print(f"\n--> {config.model_name} has {total_params / 1e6} Million params\n") + logger.log_rank_zero(f"Model has {total_params / 1e6} Million params.") + + +def print_trainable_parameters(model) -> None: + """ + Print the number of trainable parameters, all params and percentage of trainablke params. + + Args: + model: The PyTorch model. + """ + trainable_params, all_param = model.get_nb_trainable_parameters() + logger.log_rank_zero( + f"Trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}" + ) def save_to_json( From 432dcf5bb908840134223016df89c0b3207c30e1 Mon Sep 17 00:00:00 2001 From: quic-akuruvil Date: Tue, 1 Jul 2025 13:43:25 +0530 Subject: [PATCH 17/22] Gemma 3 minor fixes (#476) - CPR (#484) CI enablement and other minor fixes for Gemma3 --------- --------- Signed-off-by: Ann Kuruvilla Signed-off-by: Dipankar Sarkar Co-authored-by: Dipankar Sarkar Signed-off-by: Amit Raj From ad6fc6685fff81f6670173e50b7428852fbdbd22 Mon Sep 17 00:00:00 2001 From: Hem Agnihotri Date: Tue, 1 Jul 2025 13:48:40 +0530 Subject: [PATCH 18/22] Revert "Gemma 3 minor fixes (#476) - CPR" (#485) Reverts quic/efficient-transformers#484 Signed-off-by: Amit Raj From cab6243a1aa759e52cbd5d06e6b54b1122daad9d Mon Sep 17 00:00:00 2001 From: quic-akuruvil Date: Tue, 1 Jul 2025 13:43:25 +0530 Subject: [PATCH 19/22] Gemma 3 minor fixes (#476) - CPR (#484) CI enablement and other minor fixes for Gemma3 --------- --------- Signed-off-by: Ann Kuruvilla Signed-off-by: Dipankar Sarkar Co-authored-by: Dipankar Sarkar Signed-off-by: Amit Raj From 0101967bef07cd2f65418c94205b4f7ac0ab43dd Mon Sep 17 00:00:00 2001 From: Hem Agnihotri Date: Tue, 1 Jul 2025 13:48:40 +0530 Subject: [PATCH 20/22] Revert "Gemma 3 minor fixes (#476) - CPR" (#485) Reverts quic/efficient-transformers#484 Signed-off-by: Amit Raj From 5a6a7b72f8e71422b082249aa8b1e12f2ce41b8f Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 6 Jun 2025 06:34:54 +0000 Subject: [PATCH 21/22] Create a constant value for MIN_MASKED_ATTN_VALUE Signed-off-by: Amit Raj --- QEfficient/transformers/modeling_utils.py | 11 ++++++++--- .../transformers/models/codegen/modeling_codegen.py | 6 +++--- .../transformers/models/falcon/modeling_falcon.py | 5 ++++- .../transformers/models/gemma/modeling_gemma.py | 5 ++++- .../transformers/models/gemma2/modeling_gemma2.py | 5 ++++- QEfficient/transformers/models/gpt2/modeling_gpt2.py | 7 +++++-- QEfficient/transformers/models/gptj/modeling_gptj.py | 5 ++++- .../transformers/models/granite/modeling_granite.py | 5 ++++- .../models/granitemoe/modeling_granitemoe.py | 5 ++++- .../transformers/models/llama/modeling_llama.py | 5 ++++- .../models/llama_swiftkv/modeling_llama_swiftkv.py | 5 ++++- .../transformers/models/mistral/modeling_mistral.py | 5 ++++- .../models/mixtral_moe/modeling_mixtral.py | 5 ++++- .../transformers/models/mllama/modeling_mllama.py | 7 +++++-- QEfficient/transformers/models/mpt/modeling_mpt.py | 3 ++- QEfficient/transformers/models/phi/modeling_phi.py | 5 ++++- QEfficient/transformers/models/phi3/modeling_phi3.py | 5 ++++- .../transformers/models/qwen2/modeling_qwen2.py | 5 ++++- .../models/starcoder2/modeling_starcoder2.py | 5 ++++- .../transformers/models/whisper/modeling_whisper.py | 5 ++++- QEfficient/utils/constants.py | 2 ++ 21 files changed, 85 insertions(+), 26 deletions(-) diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 0a0e4d54b..72b7acd98 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -88,6 +88,7 @@ ) from QEfficient.customop import CustomRMSNormAIC +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE # Placeholder for all non-transformer models from .models.codegen.modeling_codegen import ( @@ -307,12 +308,12 @@ def _prepare_cross_attention_mask( # invert the mask inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype) cross_attention_mask = inverted_cross_attn_mask.masked_fill( - inverted_cross_attn_mask.to(torch.bool), torch.tensor(-10000.0, dtype=torch.float32) + inverted_cross_attn_mask.to(torch.bool), torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32) ) # apply full-row bias, which return 4D tensor of shape [B, H, S1, 1] where value is 0 if the a full row in cross attn mask's # last dimension contains negative infinity values, otherwise it's 1 - negative_inf_value = torch.tensor(-10000.0, dtype=torch.float32) + negative_inf_value = torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32) full_text_row_masked_out_mask = ( (cross_attention_mask != negative_inf_value).any(dim=-1).type_as(cross_attention_mask)[..., None] ) @@ -342,7 +343,11 @@ def _prepare_aspect_ratio_attention_mask( # Reshape to 2D and create 4D attention mask # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length) attention_mask = attention_mask.reshape(batch_size, max_num_tiles * target_length, 1) - attention_mask = attention_mask @ attention_mask.transpose(-1, -2) * torch.tensor(-10000.0, dtype=torch.float32) + attention_mask = ( + attention_mask + @ attention_mask.transpose(-1, -2) + * torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32) + ) attention_mask = attention_mask.unsqueeze(1) return attention_mask diff --git a/QEfficient/transformers/models/codegen/modeling_codegen.py b/QEfficient/transformers/models/codegen/modeling_codegen.py index e0f6b5196..e75181424 100644 --- a/QEfficient/transformers/models/codegen/modeling_codegen.py +++ b/QEfficient/transformers/models/codegen/modeling_codegen.py @@ -23,6 +23,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffCodeGenAttention(CodeGenAttention): @@ -47,11 +48,10 @@ def _attn( attn_weights = torch.matmul(query, key.transpose(-1, -2)) attn_weights = attn_weights / self.scale_attn - # Minimum value for causal mask - mask_value = -10000.0 + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + mask_value = torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=attn_weights.dtype).to(attn_weights.device) if attention_mask is not None: # Apply the attention mask diff --git a/QEfficient/transformers/models/falcon/modeling_falcon.py b/QEfficient/transformers/models/falcon/modeling_falcon.py index 593d17f1b..c79df615e 100644 --- a/QEfficient/transformers/models/falcon/modeling_falcon.py +++ b/QEfficient/transformers/models/falcon/modeling_falcon.py @@ -31,6 +31,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffFalconRotaryEmbedding(FalconRotaryEmbedding): @@ -148,7 +149,9 @@ def forward( attention_scores = query_layer @ key_layer.transpose(-1, -2) attention_scores /= math.sqrt(self.head_dim) - attention_scores = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attention_scores) + attention_scores = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attention_scores + ) attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype) # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi). attn_output = attention_scores @ value_layer diff --git a/QEfficient/transformers/models/gemma/modeling_gemma.py b/QEfficient/transformers/models/gemma/modeling_gemma.py index bd5e85d84..0cefbcfee 100644 --- a/QEfficient/transformers/models/gemma/modeling_gemma.py +++ b/QEfficient/transformers/models/gemma/modeling_gemma.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffGemmaRotaryEmbedding(GemmaRotaryEmbedding): @@ -110,7 +111,9 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/transformers/models/gemma2/modeling_gemma2.py b/QEfficient/transformers/models/gemma2/modeling_gemma2.py index fa0b3cc49..173da1798 100644 --- a/QEfficient/transformers/models/gemma2/modeling_gemma2.py +++ b/QEfficient/transformers/models/gemma2/modeling_gemma2.py @@ -30,6 +30,7 @@ # from transformers.utils import is_torchdynamo_compiling from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffGemma2RotaryEmbedding(Gemma2RotaryEmbedding): @@ -116,7 +117,9 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/transformers/models/gpt2/modeling_gpt2.py b/QEfficient/transformers/models/gpt2/modeling_gpt2.py index 0b458fbbe..7ce125a21 100644 --- a/QEfficient/transformers/models/gpt2/modeling_gpt2.py +++ b/QEfficient/transformers/models/gpt2/modeling_gpt2.py @@ -17,6 +17,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE def eager_attention_forward(module, query, key, value, attention_mask, head_mask=None, **kwargs): @@ -30,7 +31,7 @@ def eager_attention_forward(module, query, key, value, attention_mask, head_mask # if only "normal" attention layer implements causal mask query_length, key_length = query.size(-2), key.size(-2) causal_mask = module.bias[:, :, key_length - query_length : key_length, :key_length] - mask_value = -10000.0 + mask_value = MIN_MASKED_ATTENTION_VALUE # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device) @@ -38,7 +39,9 @@ def eager_attention_forward(module, query, key, value, attention_mask, head_mask if attention_mask is not None: # Apply the attention mask - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1) diff --git a/QEfficient/transformers/models/gptj/modeling_gptj.py b/QEfficient/transformers/models/gptj/modeling_gptj.py index 5daa4ace3..6b11e3f4f 100644 --- a/QEfficient/transformers/models/gptj/modeling_gptj.py +++ b/QEfficient/transformers/models/gptj/modeling_gptj.py @@ -28,6 +28,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor: @@ -62,7 +63,9 @@ def _attn( if attention_mask is not None: # Apply the attention mask - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1) attn_weights = attn_weights.to(value.dtype) diff --git a/QEfficient/transformers/models/granite/modeling_granite.py b/QEfficient/transformers/models/granite/modeling_granite.py index af4ebfc92..13b308547 100644 --- a/QEfficient/transformers/models/granite/modeling_granite.py +++ b/QEfficient/transformers/models/granite/modeling_granite.py @@ -26,6 +26,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffGraniteRotaryEmbedding(GraniteRotaryEmbedding): @@ -107,7 +108,9 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py index 6e99e2ffa..8f840b4b4 100644 --- a/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py +++ b/QEfficient/transformers/models/granitemoe/modeling_granitemoe.py @@ -29,6 +29,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffGraniteMoeRotaryEmbedding(GraniteMoeRotaryEmbedding): @@ -153,7 +154,9 @@ def forward( attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) dropout = 0.0 if not self.training else self.attention_dropout diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py index 0cccd7fcf..a285f00dc 100644 --- a/QEfficient/transformers/models/llama/modeling_llama.py +++ b/QEfficient/transformers/models/llama/modeling_llama.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffLlamaRotaryEmbedding(LlamaRotaryEmbedding): @@ -109,7 +110,9 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py index 7b96aefcc..f5e60c5de 100644 --- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py +++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py @@ -29,6 +29,7 @@ QEffLlamaRotaryEmbedding, qeff_apply_rotary_pos_emb, ) +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffLlamaSwiftKVConfig(LlamaConfig): @@ -120,7 +121,9 @@ def forward( attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: # no matter the length, we just slice it - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) # attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) diff --git a/QEfficient/transformers/models/mistral/modeling_mistral.py b/QEfficient/transformers/models/mistral/modeling_mistral.py index 59c19baa2..60b1c929d 100644 --- a/QEfficient/transformers/models/mistral/modeling_mistral.py +++ b/QEfficient/transformers/models/mistral/modeling_mistral.py @@ -31,6 +31,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffMistralRotaryEmbedding(MistralRotaryEmbedding): @@ -114,7 +115,9 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py index 808f6baf2..ef51c3421 100644 --- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py +++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py @@ -32,6 +32,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffMixtralRotaryEmbedding(MixtralRotaryEmbedding): @@ -115,7 +116,9 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 1cfafae58..58999e028 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -43,6 +43,7 @@ ) from QEfficient.utils import constants from QEfficient.utils._utils import IOInfo +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE MAX_NUM_IMG = 1 NUM_CHANNEL = 3 @@ -179,7 +180,7 @@ def forward( causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask # attn_weights = torch.where( - # attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights + # attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights # ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -256,7 +257,9 @@ def forward( attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: # no matter the length, we just slice it - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) diff --git a/QEfficient/transformers/models/mpt/modeling_mpt.py b/QEfficient/transformers/models/mpt/modeling_mpt.py index 359a32672..89d474e15 100644 --- a/QEfficient/transformers/models/mpt/modeling_mpt.py +++ b/QEfficient/transformers/models/mpt/modeling_mpt.py @@ -21,6 +21,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffMptAttention(MptAttention): @@ -78,7 +79,7 @@ def forward( if attention_mask is not None: attention_scores = torch.where( - attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attention_scores + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attention_scores ) # (batch_size, n_heads, seq_length, key_length) diff --git a/QEfficient/transformers/models/phi/modeling_phi.py b/QEfficient/transformers/models/phi/modeling_phi.py index e08dfa528..18557f1ca 100644 --- a/QEfficient/transformers/models/phi/modeling_phi.py +++ b/QEfficient/transformers/models/phi/modeling_phi.py @@ -24,6 +24,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE def eager_attention_forward( @@ -40,7 +41,9 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py index 3a54a1e83..602a73c84 100644 --- a/QEfficient/transformers/models/phi3/modeling_phi3.py +++ b/QEfficient/transformers/models/phi3/modeling_phi3.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffPhi3RotaryEmbedding(Phi3RotaryEmbedding): @@ -108,7 +109,9 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py index 67c71b32c..00a3989d8 100644 --- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py +++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py @@ -30,6 +30,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE # Can be replaced with llama/modeling_llama.py::QEffLlamaRotaryEmbedding but keeping it following transformers ideology @@ -124,7 +125,9 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py index 9ea508f5c..e3db4b490 100644 --- a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py +++ b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py @@ -27,6 +27,7 @@ from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE def eager_attention_forward( @@ -44,7 +45,9 @@ def eager_attention_forward( attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py index 2dda9ed96..afa2a6b07 100644 --- a/QEfficient/transformers/models/whisper/modeling_whisper.py +++ b/QEfficient/transformers/models/whisper/modeling_whisper.py @@ -30,6 +30,7 @@ from QEfficient.transformers.cache_utils import QEffEncoderDecoderCache from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.utils._utils import IOInfo +from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE class QEffWhisperPositionalEmbedding(WhisperPositionalEmbedding): @@ -116,7 +117,9 @@ def forward( f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" ) # updated to use torch.where, to prevent overflow in fp16 computation - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + attn_weights = torch.where( + attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights + ) attn_weights = nn.functional.softmax(attn_weights, dim=-1) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 5e855094c..9ad51c590 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -28,6 +28,8 @@ # Compiler defaults DEFAULT_AIC_NUM_CORES = 16 DEFAULT_AIC_MXPF6_MATMUL = False +# Minimum value for causal mask +MIN_MASKED_ATTENTION_VALUE = -1e4 # Store the qeff_models inside the ~/.cache directory or over-ride with an env variable. From 1673f30fbf67f1282baf8f012a086955c36ca9a5 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 12 Jun 2025 07:57:15 +0000 Subject: [PATCH 22/22] Value update for mask Signed-off-by: Amit Raj --- QEfficient/transformers/models/gpt2/modeling_gpt2.py | 3 +-- QEfficient/transformers/models/mllama/modeling_mllama.py | 3 --- QEfficient/utils/constants.py | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/QEfficient/transformers/models/gpt2/modeling_gpt2.py b/QEfficient/transformers/models/gpt2/modeling_gpt2.py index 7ce125a21..a2b84c139 100644 --- a/QEfficient/transformers/models/gpt2/modeling_gpt2.py +++ b/QEfficient/transformers/models/gpt2/modeling_gpt2.py @@ -31,10 +31,9 @@ def eager_attention_forward(module, query, key, value, attention_mask, head_mask # if only "normal" attention layer implements causal mask query_length, key_length = query.size(-2), key.size(-2) causal_mask = module.bias[:, :, key_length - query_length : key_length, :key_length] - mask_value = MIN_MASKED_ATTENTION_VALUE # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + mask_value = torch.full([], MIN_MASKED_ATTENTION_VALUE, dtype=attn_weights.dtype, device=attn_weights.device) attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) if attention_mask is not None: diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 58999e028..8a98c4c96 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -179,9 +179,6 @@ def forward( if attention_mask is not None: # no matter the length, we just slice it causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask - # attn_weights = torch.where( - # attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), attn_weights - # ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) attn_output = torch.matmul(attn_weights, value_states) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 9ad51c590..50f36ea32 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -29,7 +29,7 @@ DEFAULT_AIC_NUM_CORES = 16 DEFAULT_AIC_MXPF6_MATMUL = False # Minimum value for causal mask -MIN_MASKED_ATTENTION_VALUE = -1e4 +MIN_MASKED_ATTENTION_VALUE = float("-inf") # Store the qeff_models inside the ~/.cache directory or over-ride with an env variable.