From df3d6a9ee505c4948f441b70e467ae32ce953186 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 17 Oct 2024 04:46:11 +0000 Subject: [PATCH 1/5] Consistency Check Failure is Fixed Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/_utils.py | 583 +++++++++++++++++----------------- QEfficient/utils/constants.py | 1 + 2 files changed, 297 insertions(+), 287 deletions(-) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 3d1620b3b..2150b80f7 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -1,287 +1,296 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import os -from typing import List, Optional, Tuple, Union - -import requests -from huggingface_hub import login, snapshot_download -from requests.exceptions import HTTPError -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast - -from QEfficient.utils.constants import QEFF_MODELS_DIR -from QEfficient.utils.logging_utils import logger - - -def login_and_download_hf_lm(model_name, *args, **kwargs): - logger.info(f"loading HuggingFace model for {model_name}") - hf_token = kwargs.pop("hf_token", None) - cache_dir = kwargs.pop("cache_dir", None) - if hf_token is not None: - login(hf_token) - model_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - return model_path - - -def hf_download( - repo_id: Optional[str] = None, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - allow_patterns: Optional[List[str]] = None, - ignore_patterns: Optional[List[str]] = None, -): - # Setup cache_dir - if cache_dir is not None: - os.makedirs(cache_dir, exist_ok=True) - - max_retries = 5 - retry_count = 0 - while retry_count < max_retries: - try: - model_path = snapshot_download( - repo_id, - cache_dir=cache_dir, - revision="main", - resume_download=True, - token=hf_token, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - ) - break - except requests.ReadTimeout as e: - logger.info(f"Read timeout: {e}") - retry_count += 1 - - except HTTPError as e: - retry_count = max_retries - if e.response.status_code == 401: - logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") - else: - raise e - - return model_path - - -def qpc_exists(qpc_dir_path: str) -> bool: - """ - Checks if qpc dir exists. - Returns - 1. Boolean variable indicating if qpc files exist - 2. Path of the qpc dir if found. - --------- - - :model_name: `str` - HF Model card name. - :dir_path: `str` - Path of qpc directory. 
- - Return: - qpc_exists and path to qpc directory - """ - - # Compute the boolean indicating if the QPC exists - qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) - - return qpc_exists_bool - - -def get_onnx_dir_name(model_name, has_fbs): - # Create a unique directory name for the ONNX model - # Clearly indicate whether it's with or without FBS - # Replace all hyphens with underscores - model_name_safe = model_name.replace("/", "_").replace("-", "_") - if has_fbs: - return f"onnx_{model_name_safe}_with_fbs" - else: - return f"onnx_{model_name_safe}_without_fbs" - - -def onnx_exists(model_name: str, full_batch_size: int) -> Tuple[bool, str, str]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - - :model_name: `str`- HF Model card name. - - Return: - onnx_exists and path to onnx file and directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - # Determine if we're using full_batch_size - has_fbs = full_batch_size is not None - - # ONNX handling - onnx_dir_name = get_onnx_dir_name(model_name, has_fbs) - onnx_dir_path = os.path.join(model_card_dir, onnx_dir_name) - os.makedirs(onnx_dir_path, exist_ok=True) - clipped_onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - unclipped_onnx_model_path = clipped_onnx_model_path.replace("_clipped_fp16.onnx", ".onnx") - - # Compute the boolean indicating if the ONNX model exists - onnx_exists_bool = False - onnx_model_path = None - if os.path.isfile(os.path.join(onnx_dir_path, "custom_io_fp16.yaml")): - if os.path.isfile(clipped_onnx_model_path): - onnx_exists_bool = True - onnx_model_path = clipped_onnx_model_path - elif os.path.isfile(unclipped_onnx_model_path): - onnx_exists_bool = True - onnx_model_path = unclipped_onnx_model_path - - # Return the boolean, onnx_dir_path, and onnx_model_path - return onnx_exists_bool, onnx_dir_path, onnx_model_path - - -def load_hf_tokenizer( - pretrained_model_name_or_path: str, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - padding_side: str = "right", - **kwargs, -) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: - # FIXME: Fix kwargs to take token, cache_dir and pass via kwargs only on line 129 - logger.info("Loading Tokenizer") - if hf_token is not None: - login(hf_token) - # Download tokenizer along with model if it doesn't exist - model_hf_path = ( - pretrained_model_name_or_path - if os.path.isdir(pretrained_model_name_or_path) - else hf_download( - repo_id=pretrained_model_name_or_path, - cache_dir=cache_dir, - allow_patterns=["*.json", "*.py", "*token*", "*.txt"], - ) - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs - ) - padding_check_and_fix(tokenizer) # Check and fix tokenizer viability - - return tokenizer - - -def get_qpc_dir_path( - model_card_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size -): - # Create a unique directory name for the QPC model based on all parameters - qpc_base_dir_name = ( - f"qpc_{num_cores}cores_{batch_size}bs_{prompt_len}pl_{ctx_len}cl_{mos}mos" - + f"{f'_{full_batch_size}fbs_' if full_batch_size is not None else '_'}" - + f"{len(device_group) if device_group is not None else 1}" - + "devices" - + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else 
"_fp16_mxint8" if mxint8 else "_fp16") - ) - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_card_name)) - os.makedirs(model_card_dir, exist_ok=True) - - qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") - return qpc_dir_path - - -def check_and_assign_cache_dir(local_model_dir, cache_dir): - if local_model_dir is not None: - if cache_dir is not None: - logger.warning( - f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir." - ) - return None - return cache_dir if cache_dir else None - - -def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None: - """ - Checks and fixes tokenizer padding side and pad_token_id viability. - -------- - - tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to check and fix. - """ - if tokenizer.padding_side != "right": - logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") - tokenizer.padding_side = "right" - - if tokenizer.pad_token_id is None: - assert tokenizer.eos_token_id is not None, "Found tokenizer.eos_token_id to be None, expected int" - # If Pad token is out of range of vocab size - if tokenizer.eos_token_id < tokenizer.vocab_size: - tokenizer.pad_token_id = tokenizer.eos_token_id - else: - tokenizer.pad_token_id = tokenizer.vocab_size - 1 - - -def get_padding_shape_from_config(config, batch_size, seq_len): - """ - Gets padding dims from model config - number of kv heads and d_head - and returns padding shape - (batch_size, number of kv heads, seq_len, hidden size) - required for initialization of past_key_values - -------- - - :config: AutoConfig from pretrained model. - :batch_size: int. number of input prompts used to create inputs - :seq_len: int. sequence length to run the model for. 
- - Return: - List[int, int, int, int] - """ - - if hasattr(config, "n_head"): # Assuming n_head is a key in the config (GPTs/CodeGen) - n_heads = config.n_head - d_head = config.n_embd // config.n_head - elif hasattr(config, "num_key_value_heads") and hasattr( - config, "num_attention_heads" - ): # Check for num_key_value_heads (Llama/Mistral) - n_heads = config.num_key_value_heads - d_head = config.hidden_size // config.num_attention_heads - elif hasattr(config, "n_heads"): # Check for n_heads and d_model in the config (MPT Model) - n_heads = config.n_heads - d_head = config.d_model // config.n_heads - elif hasattr(config, "new_decoder_architecture"): # Check for Falcon - new_decoder_architecture = getattr(config, "new_decoder_architecture") - if new_decoder_architecture: # multi_query is ignored when new_decoder_architecture is True - n_heads = config.num_attention_heads - else: - if hasattr(config, "multi_query"): - multi_query_value = getattr(config, "multi_query") - if multi_query_value: - n_heads = 1 # MQA , multi query is true - else: - n_heads = config.num_attention_heads - d_head = config.hidden_size // config.num_attention_heads - else: - raise ValueError("Invalid model configuration: n_head/d_heads or num_key_value_heads not found.") - padding_shape = [batch_size, n_heads, seq_len, d_head] - if hasattr(config, "architectures") and config.architectures is not None: # Check for Starcoder1 - 3D layout - if "GPTBigCodeForCausalLM" in config.architectures: - padding_shape = [batch_size, seq_len, d_head] - return padding_shape - - -def get_num_layers_from_config(config): - """ - Gets number of layers from model config - -------- - - :config: AutoConfig from pretrained model. - - Return: - number of layers - """ - - if hasattr(config, "n_layer"): # Assuming n_layer is a key in the config (GPTs/CodeGen) - n_layer = config.n_layer - elif hasattr(config, "num_hidden_layers"): # llama/Mistral/Falcon - n_layer = config.num_hidden_layers - elif hasattr(config, "n_layers"): # Check for n_layers in the config (MPT Model) - n_layer = config.n_layers - else: - raise ValueError("Invalid model configuration: n_layer/n_layers or num_hidden_layers not found.") - - return n_layer +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import List, Optional, Tuple, Union + +import requests +from huggingface_hub import login, snapshot_download +from requests.exceptions import HTTPError +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants +from QEfficient.utils.logging_utils import logger + + +def login_and_download_hf_lm(model_name, *args, **kwargs): + logger.info(f"loading HuggingFace model for {model_name}") + hf_token = kwargs.pop("hf_token", None) + cache_dir = kwargs.pop("cache_dir", None) + if hf_token is not None: + login(hf_token) + model_path = hf_download( + repo_id=model_name, + cache_dir=cache_dir, + ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], + ) + return model_path + + +def hf_download( + repo_id: Optional[str] = None, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + allow_patterns: Optional[List[str]] = None, + ignore_patterns: Optional[List[str]] = None, +): + # Setup cache_dir + if cache_dir is not None: + os.makedirs(cache_dir, exist_ok=True) + + max_retries = Constants.MAX_RETRIES + retry_count = 0 + while retry_count < max_retries: + try: + model_path = snapshot_download( + repo_id, + cache_dir=cache_dir, + revision="main", + resume_download=True, + token=hf_token, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + ) + break + except requests.ReadTimeout as e: + logger.info(f"Read timeout: {e}") + retry_count += 1 + + except HTTPError as e: + retry_count = max_retries + if e.response.status_code == 401: + logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") + else: + raise e + + except OSError as e: + logger.error(f"OSError: {e}") + if "Consistency check failed" in str(e): + logger.info( + "OSError: Consistency check failed: file should not be incomplete, Resuming the downloading..." + ) + else: + raise e + + return model_path + + +def qpc_exists(qpc_dir_path: str) -> bool: + """ + Checks if qpc dir exists. + Returns + 1. Boolean variable indicating if qpc files exist + 2. Path of the qpc dir if found. + --------- + + :model_name: `str` - HF Model card name. + :dir_path: `str` - Path of qpc directory. + + Return: + qpc_exists and path to qpc directory + """ + + # Compute the boolean indicating if the QPC exists + qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) + + return qpc_exists_bool + + +def get_onnx_dir_name(model_name, has_fbs): + # Create a unique directory name for the ONNX model + # Clearly indicate whether it's with or without FBS + # Replace all hyphens with underscores + model_name_safe = model_name.replace("/", "_").replace("-", "_") + if has_fbs: + return f"onnx_{model_name_safe}_with_fbs" + else: + return f"onnx_{model_name_safe}_without_fbs" + + +def onnx_exists(model_name: str, full_batch_size: int) -> Tuple[bool, str, str]: + """ + Checks if qpc files already exists, removes the directory if files have been manipulated. + --------- + + :model_name: `str`- HF Model card name. 
+ + Return: + onnx_exists and path to onnx file and directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + # Determine if we're using full_batch_size + has_fbs = full_batch_size is not None + + # ONNX handling + onnx_dir_name = get_onnx_dir_name(model_name, has_fbs) + onnx_dir_path = os.path.join(model_card_dir, onnx_dir_name) + os.makedirs(onnx_dir_path, exist_ok=True) + clipped_onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + unclipped_onnx_model_path = clipped_onnx_model_path.replace("_clipped_fp16.onnx", ".onnx") + + # Compute the boolean indicating if the ONNX model exists + onnx_exists_bool = False + onnx_model_path = None + if os.path.isfile(os.path.join(onnx_dir_path, "custom_io_fp16.yaml")): + if os.path.isfile(clipped_onnx_model_path): + onnx_exists_bool = True + onnx_model_path = clipped_onnx_model_path + elif os.path.isfile(unclipped_onnx_model_path): + onnx_exists_bool = True + onnx_model_path = unclipped_onnx_model_path + + # Return the boolean, onnx_dir_path, and onnx_model_path + return onnx_exists_bool, onnx_dir_path, onnx_model_path + + +def load_hf_tokenizer( + pretrained_model_name_or_path: str, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + padding_side: str = "right", + **kwargs, +) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: + # FIXME: Fix kwargs to take token, cache_dir and pass via kwargs only on line 129 + logger.info("Loading Tokenizer") + if hf_token is not None: + login(hf_token) + # Download tokenizer along with model if it doesn't exist + model_hf_path = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) + else hf_download( + repo_id=pretrained_model_name_or_path, + cache_dir=cache_dir, + allow_patterns=["*.json", "*.py", "*token*", "*.txt"], + ) + ) + tokenizer = AutoTokenizer.from_pretrained( + model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs + ) + padding_check_and_fix(tokenizer) # Check and fix tokenizer viability + + return tokenizer + + +def get_qpc_dir_path( + model_card_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size +): + # Create a unique directory name for the QPC model based on all parameters + qpc_base_dir_name = ( + f"qpc_{num_cores}cores_{batch_size}bs_{prompt_len}pl_{ctx_len}cl_{mos}mos" + + f"{f'_{full_batch_size}fbs_' if full_batch_size is not None else '_'}" + + f"{len(device_group) if device_group is not None else 1}" + + "devices" + + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") + ) + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_card_name)) + os.makedirs(model_card_dir, exist_ok=True) + + qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") + return qpc_dir_path + + +def check_and_assign_cache_dir(local_model_dir, cache_dir): + if local_model_dir is not None: + if cache_dir is not None: + logger.warning( + f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir." + ) + return None + return cache_dir if cache_dir else None + + +def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None: + """ + Checks and fixes tokenizer padding side and pad_token_id viability. + -------- + + tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to check and fix. 
+ """ + if tokenizer.padding_side != "right": + logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") + tokenizer.padding_side = "right" + + if tokenizer.pad_token_id is None: + assert tokenizer.eos_token_id is not None, "Found tokenizer.eos_token_id to be None, expected int" + # If Pad token is out of range of vocab size + if tokenizer.eos_token_id < tokenizer.vocab_size: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.pad_token_id = tokenizer.vocab_size - 1 + + +def get_padding_shape_from_config(config, batch_size, seq_len): + """ + Gets padding dims from model config - number of kv heads and d_head + and returns padding shape - (batch_size, number of kv heads, seq_len, hidden size) + required for initialization of past_key_values + -------- + + :config: AutoConfig from pretrained model. + :batch_size: int. number of input prompts used to create inputs + :seq_len: int. sequence length to run the model for. + + Return: + List[int, int, int, int] + """ + + if hasattr(config, "n_head"): # Assuming n_head is a key in the config (GPTs/CodeGen) + n_heads = config.n_head + d_head = config.n_embd // config.n_head + elif hasattr(config, "num_key_value_heads") and hasattr( + config, "num_attention_heads" + ): # Check for num_key_value_heads (Llama/Mistral) + n_heads = config.num_key_value_heads + d_head = config.hidden_size // config.num_attention_heads + elif hasattr(config, "n_heads"): # Check for n_heads and d_model in the config (MPT Model) + n_heads = config.n_heads + d_head = config.d_model // config.n_heads + elif hasattr(config, "new_decoder_architecture"): # Check for Falcon + new_decoder_architecture = getattr(config, "new_decoder_architecture") + if new_decoder_architecture: # multi_query is ignored when new_decoder_architecture is True + n_heads = config.num_attention_heads + else: + if hasattr(config, "multi_query"): + multi_query_value = getattr(config, "multi_query") + if multi_query_value: + n_heads = 1 # MQA , multi query is true + else: + n_heads = config.num_attention_heads + d_head = config.hidden_size // config.num_attention_heads + else: + raise ValueError("Invalid model configuration: n_head/d_heads or num_key_value_heads not found.") + padding_shape = [batch_size, n_heads, seq_len, d_head] + if hasattr(config, "architectures") and config.architectures is not None: # Check for Starcoder1 - 3D layout + if "GPTBigCodeForCausalLM" in config.architectures: + padding_shape = [batch_size, seq_len, d_head] + return padding_shape + + +def get_num_layers_from_config(config): + """ + Gets number of layers from model config + -------- + + :config: AutoConfig from pretrained model. 
+ + Return: + number of layers + """ + + if hasattr(config, "n_layer"): # Assuming n_layer is a key in the config (GPTs/CodeGen) + n_layer = config.n_layer + elif hasattr(config, "num_hidden_layers"): # llama/Mistral/Falcon + n_layer = config.num_hidden_layers + elif hasattr(config, "n_layers"): # Check for n_layers in the config (MPT Model) + n_layer = config.n_layers + else: + raise ValueError("Invalid model configuration: n_layer/n_layers or num_hidden_layers not found.") + + return n_layer diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 32a9e0e4a..0bc23a2c2 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -55,3 +55,4 @@ class Constants: INPUT_STR = ["My name is"] GB = 2**30 MAX_QPC_LIMIT = 30 + MAX_RETRIES = 5 From 87465ec816272b6fbcb9d7cb120dc96db6d6b021 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 17 Oct 2024 10:42:10 +0530 Subject: [PATCH 2/5] linter fixed I Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/_utils.py | 592 ++++++++++++++++++------------------- 1 file changed, 296 insertions(+), 296 deletions(-) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 2150b80f7..224935d53 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -1,296 +1,296 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import os -from typing import List, Optional, Tuple, Union - -import requests -from huggingface_hub import login, snapshot_download -from requests.exceptions import HTTPError -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast - -from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants -from QEfficient.utils.logging_utils import logger - - -def login_and_download_hf_lm(model_name, *args, **kwargs): - logger.info(f"loading HuggingFace model for {model_name}") - hf_token = kwargs.pop("hf_token", None) - cache_dir = kwargs.pop("cache_dir", None) - if hf_token is not None: - login(hf_token) - model_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - return model_path - - -def hf_download( - repo_id: Optional[str] = None, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - allow_patterns: Optional[List[str]] = None, - ignore_patterns: Optional[List[str]] = None, -): - # Setup cache_dir - if cache_dir is not None: - os.makedirs(cache_dir, exist_ok=True) - - max_retries = Constants.MAX_RETRIES - retry_count = 0 - while retry_count < max_retries: - try: - model_path = snapshot_download( - repo_id, - cache_dir=cache_dir, - revision="main", - resume_download=True, - token=hf_token, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - ) - break - except requests.ReadTimeout as e: - logger.info(f"Read timeout: {e}") - retry_count += 1 - - except HTTPError as e: - retry_count = max_retries - if e.response.status_code == 401: - logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") - else: - raise e - - except OSError as e: - logger.error(f"OSError: {e}") - if "Consistency check failed" in str(e): - logger.info( - "OSError: Consistency check failed: file should not be incomplete, Resuming the downloading..." 
- ) - else: - raise e - - return model_path - - -def qpc_exists(qpc_dir_path: str) -> bool: - """ - Checks if qpc dir exists. - Returns - 1. Boolean variable indicating if qpc files exist - 2. Path of the qpc dir if found. - --------- - - :model_name: `str` - HF Model card name. - :dir_path: `str` - Path of qpc directory. - - Return: - qpc_exists and path to qpc directory - """ - - # Compute the boolean indicating if the QPC exists - qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) - - return qpc_exists_bool - - -def get_onnx_dir_name(model_name, has_fbs): - # Create a unique directory name for the ONNX model - # Clearly indicate whether it's with or without FBS - # Replace all hyphens with underscores - model_name_safe = model_name.replace("/", "_").replace("-", "_") - if has_fbs: - return f"onnx_{model_name_safe}_with_fbs" - else: - return f"onnx_{model_name_safe}_without_fbs" - - -def onnx_exists(model_name: str, full_batch_size: int) -> Tuple[bool, str, str]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - - :model_name: `str`- HF Model card name. - - Return: - onnx_exists and path to onnx file and directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - # Determine if we're using full_batch_size - has_fbs = full_batch_size is not None - - # ONNX handling - onnx_dir_name = get_onnx_dir_name(model_name, has_fbs) - onnx_dir_path = os.path.join(model_card_dir, onnx_dir_name) - os.makedirs(onnx_dir_path, exist_ok=True) - clipped_onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - unclipped_onnx_model_path = clipped_onnx_model_path.replace("_clipped_fp16.onnx", ".onnx") - - # Compute the boolean indicating if the ONNX model exists - onnx_exists_bool = False - onnx_model_path = None - if os.path.isfile(os.path.join(onnx_dir_path, "custom_io_fp16.yaml")): - if os.path.isfile(clipped_onnx_model_path): - onnx_exists_bool = True - onnx_model_path = clipped_onnx_model_path - elif os.path.isfile(unclipped_onnx_model_path): - onnx_exists_bool = True - onnx_model_path = unclipped_onnx_model_path - - # Return the boolean, onnx_dir_path, and onnx_model_path - return onnx_exists_bool, onnx_dir_path, onnx_model_path - - -def load_hf_tokenizer( - pretrained_model_name_or_path: str, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - padding_side: str = "right", - **kwargs, -) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: - # FIXME: Fix kwargs to take token, cache_dir and pass via kwargs only on line 129 - logger.info("Loading Tokenizer") - if hf_token is not None: - login(hf_token) - # Download tokenizer along with model if it doesn't exist - model_hf_path = ( - pretrained_model_name_or_path - if os.path.isdir(pretrained_model_name_or_path) - else hf_download( - repo_id=pretrained_model_name_or_path, - cache_dir=cache_dir, - allow_patterns=["*.json", "*.py", "*token*", "*.txt"], - ) - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs - ) - padding_check_and_fix(tokenizer) # Check and fix tokenizer viability - - return tokenizer - - -def get_qpc_dir_path( - model_card_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size -): - # Create a unique directory name for the QPC model based on all parameters - 
qpc_base_dir_name = ( - f"qpc_{num_cores}cores_{batch_size}bs_{prompt_len}pl_{ctx_len}cl_{mos}mos" - + f"{f'_{full_batch_size}fbs_' if full_batch_size is not None else '_'}" - + f"{len(device_group) if device_group is not None else 1}" - + "devices" - + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") - ) - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_card_name)) - os.makedirs(model_card_dir, exist_ok=True) - - qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") - return qpc_dir_path - - -def check_and_assign_cache_dir(local_model_dir, cache_dir): - if local_model_dir is not None: - if cache_dir is not None: - logger.warning( - f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir." - ) - return None - return cache_dir if cache_dir else None - - -def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None: - """ - Checks and fixes tokenizer padding side and pad_token_id viability. - -------- - - tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to check and fix. - """ - if tokenizer.padding_side != "right": - logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") - tokenizer.padding_side = "right" - - if tokenizer.pad_token_id is None: - assert tokenizer.eos_token_id is not None, "Found tokenizer.eos_token_id to be None, expected int" - # If Pad token is out of range of vocab size - if tokenizer.eos_token_id < tokenizer.vocab_size: - tokenizer.pad_token_id = tokenizer.eos_token_id - else: - tokenizer.pad_token_id = tokenizer.vocab_size - 1 - - -def get_padding_shape_from_config(config, batch_size, seq_len): - """ - Gets padding dims from model config - number of kv heads and d_head - and returns padding shape - (batch_size, number of kv heads, seq_len, hidden size) - required for initialization of past_key_values - -------- - - :config: AutoConfig from pretrained model. - :batch_size: int. number of input prompts used to create inputs - :seq_len: int. sequence length to run the model for. 
- - Return: - List[int, int, int, int] - """ - - if hasattr(config, "n_head"): # Assuming n_head is a key in the config (GPTs/CodeGen) - n_heads = config.n_head - d_head = config.n_embd // config.n_head - elif hasattr(config, "num_key_value_heads") and hasattr( - config, "num_attention_heads" - ): # Check for num_key_value_heads (Llama/Mistral) - n_heads = config.num_key_value_heads - d_head = config.hidden_size // config.num_attention_heads - elif hasattr(config, "n_heads"): # Check for n_heads and d_model in the config (MPT Model) - n_heads = config.n_heads - d_head = config.d_model // config.n_heads - elif hasattr(config, "new_decoder_architecture"): # Check for Falcon - new_decoder_architecture = getattr(config, "new_decoder_architecture") - if new_decoder_architecture: # multi_query is ignored when new_decoder_architecture is True - n_heads = config.num_attention_heads - else: - if hasattr(config, "multi_query"): - multi_query_value = getattr(config, "multi_query") - if multi_query_value: - n_heads = 1 # MQA , multi query is true - else: - n_heads = config.num_attention_heads - d_head = config.hidden_size // config.num_attention_heads - else: - raise ValueError("Invalid model configuration: n_head/d_heads or num_key_value_heads not found.") - padding_shape = [batch_size, n_heads, seq_len, d_head] - if hasattr(config, "architectures") and config.architectures is not None: # Check for Starcoder1 - 3D layout - if "GPTBigCodeForCausalLM" in config.architectures: - padding_shape = [batch_size, seq_len, d_head] - return padding_shape - - -def get_num_layers_from_config(config): - """ - Gets number of layers from model config - -------- - - :config: AutoConfig from pretrained model. - - Return: - number of layers - """ - - if hasattr(config, "n_layer"): # Assuming n_layer is a key in the config (GPTs/CodeGen) - n_layer = config.n_layer - elif hasattr(config, "num_hidden_layers"): # llama/Mistral/Falcon - n_layer = config.num_hidden_layers - elif hasattr(config, "n_layers"): # Check for n_layers in the config (MPT Model) - n_layer = config.n_layers - else: - raise ValueError("Invalid model configuration: n_layer/n_layers or num_hidden_layers not found.") - - return n_layer +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import List, Optional, Tuple, Union + +import requests +from huggingface_hub import login, snapshot_download +from requests.exceptions import HTTPError +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants +from QEfficient.utils.logging_utils import logger + + +def login_and_download_hf_lm(model_name, *args, **kwargs): + logger.info(f"loading HuggingFace model for {model_name}") + hf_token = kwargs.pop("hf_token", None) + cache_dir = kwargs.pop("cache_dir", None) + if hf_token is not None: + login(hf_token) + model_path = hf_download( + repo_id=model_name, + cache_dir=cache_dir, + ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], + ) + return model_path + + +def hf_download( + repo_id: Optional[str] = None, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + allow_patterns: Optional[List[str]] = None, + ignore_patterns: Optional[List[str]] = None, +): + # Setup cache_dir + if cache_dir is not None: + os.makedirs(cache_dir, exist_ok=True) + + max_retries = Constants.MAX_RETRIES + retry_count = 0 + while retry_count < max_retries: + try: + model_path = snapshot_download( + repo_id, + cache_dir=cache_dir, + revision="main", + resume_download=True, + token=hf_token, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + ) + break + except requests.ReadTimeout as e: + logger.info(f"Read timeout: {e}") + retry_count += 1 + + except HTTPError as e: + retry_count = max_retries + if e.response.status_code == 401: + logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") + else: + raise e + + except OSError as e: + logger.error(f"OSError: {e}") + if "Consistency check failed" in str(e): + logger.info( + "OSError: Consistency check failed: file should not be incomplete, Resuming the downloading..." + ) + else: + raise e + + return model_path + + +def qpc_exists(qpc_dir_path: str) -> bool: + """ + Checks if qpc dir exists. + Returns + 1. Boolean variable indicating if qpc files exist + 2. Path of the qpc dir if found. + --------- + + :model_name: `str` - HF Model card name. + :dir_path: `str` - Path of qpc directory. + + Return: + qpc_exists and path to qpc directory + """ + + # Compute the boolean indicating if the QPC exists + qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) + + return qpc_exists_bool + + +def get_onnx_dir_name(model_name, has_fbs): + # Create a unique directory name for the ONNX model + # Clearly indicate whether it's with or without FBS + # Replace all hyphens with underscores + model_name_safe = model_name.replace("/", "_").replace("-", "_") + if has_fbs: + return f"onnx_{model_name_safe}_with_fbs" + else: + return f"onnx_{model_name_safe}_without_fbs" + + +def onnx_exists(model_name: str, full_batch_size: int) -> Tuple[bool, str, str]: + """ + Checks if qpc files already exists, removes the directory if files have been manipulated. + --------- + + :model_name: `str`- HF Model card name. 
+ + Return: + onnx_exists and path to onnx file and directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + # Determine if we're using full_batch_size + has_fbs = full_batch_size is not None + + # ONNX handling + onnx_dir_name = get_onnx_dir_name(model_name, has_fbs) + onnx_dir_path = os.path.join(model_card_dir, onnx_dir_name) + os.makedirs(onnx_dir_path, exist_ok=True) + clipped_onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + unclipped_onnx_model_path = clipped_onnx_model_path.replace("_clipped_fp16.onnx", ".onnx") + + # Compute the boolean indicating if the ONNX model exists + onnx_exists_bool = False + onnx_model_path = None + if os.path.isfile(os.path.join(onnx_dir_path, "custom_io_fp16.yaml")): + if os.path.isfile(clipped_onnx_model_path): + onnx_exists_bool = True + onnx_model_path = clipped_onnx_model_path + elif os.path.isfile(unclipped_onnx_model_path): + onnx_exists_bool = True + onnx_model_path = unclipped_onnx_model_path + + # Return the boolean, onnx_dir_path, and onnx_model_path + return onnx_exists_bool, onnx_dir_path, onnx_model_path + + +def load_hf_tokenizer( + pretrained_model_name_or_path: str, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + padding_side: str = "right", + **kwargs, +) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: + # FIXME: Fix kwargs to take token, cache_dir and pass via kwargs only on line 129 + logger.info("Loading Tokenizer") + if hf_token is not None: + login(hf_token) + # Download tokenizer along with model if it doesn't exist + model_hf_path = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) + else hf_download( + repo_id=pretrained_model_name_or_path, + cache_dir=cache_dir, + allow_patterns=["*.json", "*.py", "*token*", "*.txt"], + ) + ) + tokenizer = AutoTokenizer.from_pretrained( + model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs + ) + padding_check_and_fix(tokenizer) # Check and fix tokenizer viability + + return tokenizer + + +def get_qpc_dir_path( + model_card_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size +): + # Create a unique directory name for the QPC model based on all parameters + qpc_base_dir_name = ( + f"qpc_{num_cores}cores_{batch_size}bs_{prompt_len}pl_{ctx_len}cl_{mos}mos" + + f"{f'_{full_batch_size}fbs_' if full_batch_size is not None else '_'}" + + f"{len(device_group) if device_group is not None else 1}" + + "devices" + + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") + ) + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_card_name)) + os.makedirs(model_card_dir, exist_ok=True) + + qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") + return qpc_dir_path + + +def check_and_assign_cache_dir(local_model_dir, cache_dir): + if local_model_dir is not None: + if cache_dir is not None: + logger.warning( + f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir." + ) + return None + return cache_dir if cache_dir else None + + +def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None: + """ + Checks and fixes tokenizer padding side and pad_token_id viability. + -------- + + tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to check and fix. 
+ """ + if tokenizer.padding_side != "right": + logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") + tokenizer.padding_side = "right" + + if tokenizer.pad_token_id is None: + assert tokenizer.eos_token_id is not None, "Found tokenizer.eos_token_id to be None, expected int" + # If Pad token is out of range of vocab size + if tokenizer.eos_token_id < tokenizer.vocab_size: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.pad_token_id = tokenizer.vocab_size - 1 + + +def get_padding_shape_from_config(config, batch_size, seq_len): + """ + Gets padding dims from model config - number of kv heads and d_head + and returns padding shape - (batch_size, number of kv heads, seq_len, hidden size) + required for initialization of past_key_values + -------- + + :config: AutoConfig from pretrained model. + :batch_size: int. number of input prompts used to create inputs + :seq_len: int. sequence length to run the model for. + + Return: + List[int, int, int, int] + """ + + if hasattr(config, "n_head"): # Assuming n_head is a key in the config (GPTs/CodeGen) + n_heads = config.n_head + d_head = config.n_embd // config.n_head + elif hasattr(config, "num_key_value_heads") and hasattr( + config, "num_attention_heads" + ): # Check for num_key_value_heads (Llama/Mistral) + n_heads = config.num_key_value_heads + d_head = config.hidden_size // config.num_attention_heads + elif hasattr(config, "n_heads"): # Check for n_heads and d_model in the config (MPT Model) + n_heads = config.n_heads + d_head = config.d_model // config.n_heads + elif hasattr(config, "new_decoder_architecture"): # Check for Falcon + new_decoder_architecture = getattr(config, "new_decoder_architecture") + if new_decoder_architecture: # multi_query is ignored when new_decoder_architecture is True + n_heads = config.num_attention_heads + else: + if hasattr(config, "multi_query"): + multi_query_value = getattr(config, "multi_query") + if multi_query_value: + n_heads = 1 # MQA , multi query is true + else: + n_heads = config.num_attention_heads + d_head = config.hidden_size // config.num_attention_heads + else: + raise ValueError("Invalid model configuration: n_head/d_heads or num_key_value_heads not found.") + padding_shape = [batch_size, n_heads, seq_len, d_head] + if hasattr(config, "architectures") and config.architectures is not None: # Check for Starcoder1 - 3D layout + if "GPTBigCodeForCausalLM" in config.architectures: + padding_shape = [batch_size, seq_len, d_head] + return padding_shape + + +def get_num_layers_from_config(config): + """ + Gets number of layers from model config + -------- + + :config: AutoConfig from pretrained model. 
+ + Return: + number of layers + """ + + if hasattr(config, "n_layer"): # Assuming n_layer is a key in the config (GPTs/CodeGen) + n_layer = config.n_layer + elif hasattr(config, "num_hidden_layers"): # llama/Mistral/Falcon + n_layer = config.num_hidden_layers + elif hasattr(config, "n_layers"): # Check for n_layers in the config (MPT Model) + n_layer = config.n_layers + else: + raise ValueError("Invalid model configuration: n_layer/n_layers or num_hidden_layers not found.") + + return n_layer From f06bbcc8b4cb83ac10c99a99bdf0d867be14a7c3 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 17 Oct 2024 11:53:08 +0530 Subject: [PATCH 3/5] Requested Changes have been made Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/_utils.py | 2 +- QEfficient/utils/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 224935d53..552041fe9 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -71,7 +71,7 @@ def hf_download( logger.error(f"OSError: {e}") if "Consistency check failed" in str(e): logger.info( - "OSError: Consistency check failed: file should not be incomplete, Resuming the downloading..." + "Consistency check failed during model download. The file appears to be incomplete. Resuming the download..." ) else: raise e diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 0bc23a2c2..cf677bb97 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -55,4 +55,4 @@ class Constants: INPUT_STR = ["My name is"] GB = 2**30 MAX_QPC_LIMIT = 30 - MAX_RETRIES = 5 + MAX_RETRIES = 5 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download From e303768b0c80b2ef1422e3ee7199f7ff9ada2bdf Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 17 Oct 2024 16:30:16 +0530 Subject: [PATCH 4/5] removed extra lines Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 552041fe9..3648e7853 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -42,9 +42,8 @@ def hf_download( if cache_dir is not None: os.makedirs(cache_dir, exist_ok=True) - max_retries = Constants.MAX_RETRIES retry_count = 0 - while retry_count < max_retries: + while retry_count < Constants.MAX_RETRIES: try: model_path = snapshot_download( repo_id, @@ -61,7 +60,7 @@ def hf_download( retry_count += 1 except HTTPError as e: - retry_count = max_retries + retry_count = Constants.MAX_RETRIES if e.response.status_code == 401: logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") else: From 63cde2b296b98326d0e4bf0160026c68affbc61b Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 17 Oct 2024 19:39:56 +0530 Subject: [PATCH 5/5] fixed hf_download function Signed-off-by: Onkar Chougule --- QEfficient/utils/_utils.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 3648e7853..223c58d5c 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -17,6 +17,12 @@ from QEfficient.utils.logging_utils import logger +class DownloadRetryLimitExceeded(Exception): + """ + Used for raising error when hf_download fails to download the model after given max_retries. 
+ """ + + def login_and_download_hf_lm(model_name, *args, **kwargs): logger.info(f"loading HuggingFace model for {model_name}") hf_token = kwargs.pop("hf_token", None) @@ -37,13 +43,14 @@ def hf_download( hf_token: Optional[str] = None, allow_patterns: Optional[List[str]] = None, ignore_patterns: Optional[List[str]] = None, + max_retries: Optional[int] = Constants.MAX_RETRIES, ): # Setup cache_dir if cache_dir is not None: os.makedirs(cache_dir, exist_ok=True) retry_count = 0 - while retry_count < Constants.MAX_RETRIES: + while retry_count < max_retries: try: model_path = snapshot_download( repo_id, @@ -58,23 +65,23 @@ def hf_download( except requests.ReadTimeout as e: logger.info(f"Read timeout: {e}") retry_count += 1 - except HTTPError as e: - retry_count = Constants.MAX_RETRIES if e.response.status_code == 401: logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") - else: - raise e - + raise e except OSError as e: - logger.error(f"OSError: {e}") if "Consistency check failed" in str(e): logger.info( "Consistency check failed during model download. The file appears to be incomplete. Resuming the download..." ) + retry_count += 1 else: raise e + if retry_count >= max_retries: + raise DownloadRetryLimitExceeded( + f"Unable to download full model after {max_retries} tries. If the model fileS are huge in size, please try again." + ) return model_path
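
For reference, a minimal caller-side sketch of the retry behaviour this series introduces; the repo id and cache directory below are placeholder values, everything else (hf_download, Constants.MAX_RETRIES, DownloadRetryLimitExceeded) comes from the patches above:

    from QEfficient.utils._utils import DownloadRetryLimitExceeded, hf_download

    try:
        # hf_download retries up to Constants.MAX_RETRIES times on read timeouts
        # and on huggingface_hub "Consistency check failed" OSErrors before giving up.
        model_path = hf_download(
            repo_id="gpt2",                # placeholder model card name
            cache_dir="/tmp/qeff_cache",   # placeholder cache directory
            ignore_patterns=["*.onnx", "*.h5"],
        )
        print(f"Model downloaded to {model_path}")
    except DownloadRetryLimitExceeded as e:
        # Raised by patch 5/5 once the retry budget is exhausted.
        print(f"Download did not complete: {e}")

Callers that need a different retry budget can pass max_retries explicitly, since patch 5/5 exposes it as a keyword argument defaulting to Constants.MAX_RETRIES.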