From df3d6a9ee505c4948f441b70e467ae32ce953186 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 17 Oct 2024 04:46:11 +0000 Subject: [PATCH 1/5] Consistency Check Failure is Fixed Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/_utils.py | 583 +++++++++++++++++----------------- QEfficient/utils/constants.py | 1 + 2 files changed, 297 insertions(+), 287 deletions(-) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 3d1620b3b..2150b80f7 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -1,287 +1,296 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import os -from typing import List, Optional, Tuple, Union - -import requests -from huggingface_hub import login, snapshot_download -from requests.exceptions import HTTPError -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast - -from QEfficient.utils.constants import QEFF_MODELS_DIR -from QEfficient.utils.logging_utils import logger - - -def login_and_download_hf_lm(model_name, *args, **kwargs): - logger.info(f"loading HuggingFace model for {model_name}") - hf_token = kwargs.pop("hf_token", None) - cache_dir = kwargs.pop("cache_dir", None) - if hf_token is not None: - login(hf_token) - model_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - return model_path - - -def hf_download( - repo_id: Optional[str] = None, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - allow_patterns: Optional[List[str]] = None, - ignore_patterns: Optional[List[str]] = None, -): - # Setup cache_dir - if cache_dir is not None: - os.makedirs(cache_dir, exist_ok=True) - - max_retries = 5 - retry_count = 0 - while retry_count < max_retries: - try: - model_path = snapshot_download( - repo_id, - cache_dir=cache_dir, - revision="main", - resume_download=True, - token=hf_token, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - ) - break - except requests.ReadTimeout as e: - logger.info(f"Read timeout: {e}") - retry_count += 1 - - except HTTPError as e: - retry_count = max_retries - if e.response.status_code == 401: - logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") - else: - raise e - - return model_path - - -def qpc_exists(qpc_dir_path: str) -> bool: - """ - Checks if qpc dir exists. - Returns - 1. Boolean variable indicating if qpc files exist - 2. Path of the qpc dir if found. - --------- - - :model_name: `str` - HF Model card name. - :dir_path: `str` - Path of qpc directory. 
- - Return: - qpc_exists and path to qpc directory - """ - - # Compute the boolean indicating if the QPC exists - qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) - - return qpc_exists_bool - - -def get_onnx_dir_name(model_name, has_fbs): - # Create a unique directory name for the ONNX model - # Clearly indicate whether it's with or without FBS - # Replace all hyphens with underscores - model_name_safe = model_name.replace("/", "_").replace("-", "_") - if has_fbs: - return f"onnx_{model_name_safe}_with_fbs" - else: - return f"onnx_{model_name_safe}_without_fbs" - - -def onnx_exists(model_name: str, full_batch_size: int) -> Tuple[bool, str, str]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - - :model_name: `str`- HF Model card name. - - Return: - onnx_exists and path to onnx file and directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - # Determine if we're using full_batch_size - has_fbs = full_batch_size is not None - - # ONNX handling - onnx_dir_name = get_onnx_dir_name(model_name, has_fbs) - onnx_dir_path = os.path.join(model_card_dir, onnx_dir_name) - os.makedirs(onnx_dir_path, exist_ok=True) - clipped_onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - unclipped_onnx_model_path = clipped_onnx_model_path.replace("_clipped_fp16.onnx", ".onnx") - - # Compute the boolean indicating if the ONNX model exists - onnx_exists_bool = False - onnx_model_path = None - if os.path.isfile(os.path.join(onnx_dir_path, "custom_io_fp16.yaml")): - if os.path.isfile(clipped_onnx_model_path): - onnx_exists_bool = True - onnx_model_path = clipped_onnx_model_path - elif os.path.isfile(unclipped_onnx_model_path): - onnx_exists_bool = True - onnx_model_path = unclipped_onnx_model_path - - # Return the boolean, onnx_dir_path, and onnx_model_path - return onnx_exists_bool, onnx_dir_path, onnx_model_path - - -def load_hf_tokenizer( - pretrained_model_name_or_path: str, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - padding_side: str = "right", - **kwargs, -) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: - # FIXME: Fix kwargs to take token, cache_dir and pass via kwargs only on line 129 - logger.info("Loading Tokenizer") - if hf_token is not None: - login(hf_token) - # Download tokenizer along with model if it doesn't exist - model_hf_path = ( - pretrained_model_name_or_path - if os.path.isdir(pretrained_model_name_or_path) - else hf_download( - repo_id=pretrained_model_name_or_path, - cache_dir=cache_dir, - allow_patterns=["*.json", "*.py", "*token*", "*.txt"], - ) - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs - ) - padding_check_and_fix(tokenizer) # Check and fix tokenizer viability - - return tokenizer - - -def get_qpc_dir_path( - model_card_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size -): - # Create a unique directory name for the QPC model based on all parameters - qpc_base_dir_name = ( - f"qpc_{num_cores}cores_{batch_size}bs_{prompt_len}pl_{ctx_len}cl_{mos}mos" - + f"{f'_{full_batch_size}fbs_' if full_batch_size is not None else '_'}" - + f"{len(device_group) if device_group is not None else 1}" - + "devices" - + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else 
"_fp16_mxint8" if mxint8 else "_fp16") - ) - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_card_name)) - os.makedirs(model_card_dir, exist_ok=True) - - qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") - return qpc_dir_path - - -def check_and_assign_cache_dir(local_model_dir, cache_dir): - if local_model_dir is not None: - if cache_dir is not None: - logger.warning( - f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir." - ) - return None - return cache_dir if cache_dir else None - - -def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None: - """ - Checks and fixes tokenizer padding side and pad_token_id viability. - -------- - - tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to check and fix. - """ - if tokenizer.padding_side != "right": - logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") - tokenizer.padding_side = "right" - - if tokenizer.pad_token_id is None: - assert tokenizer.eos_token_id is not None, "Found tokenizer.eos_token_id to be None, expected int" - # If Pad token is out of range of vocab size - if tokenizer.eos_token_id < tokenizer.vocab_size: - tokenizer.pad_token_id = tokenizer.eos_token_id - else: - tokenizer.pad_token_id = tokenizer.vocab_size - 1 - - -def get_padding_shape_from_config(config, batch_size, seq_len): - """ - Gets padding dims from model config - number of kv heads and d_head - and returns padding shape - (batch_size, number of kv heads, seq_len, hidden size) - required for initialization of past_key_values - -------- - - :config: AutoConfig from pretrained model. - :batch_size: int. number of input prompts used to create inputs - :seq_len: int. sequence length to run the model for. 
- - Return: - List[int, int, int, int] - """ - - if hasattr(config, "n_head"): # Assuming n_head is a key in the config (GPTs/CodeGen) - n_heads = config.n_head - d_head = config.n_embd // config.n_head - elif hasattr(config, "num_key_value_heads") and hasattr( - config, "num_attention_heads" - ): # Check for num_key_value_heads (Llama/Mistral) - n_heads = config.num_key_value_heads - d_head = config.hidden_size // config.num_attention_heads - elif hasattr(config, "n_heads"): # Check for n_heads and d_model in the config (MPT Model) - n_heads = config.n_heads - d_head = config.d_model // config.n_heads - elif hasattr(config, "new_decoder_architecture"): # Check for Falcon - new_decoder_architecture = getattr(config, "new_decoder_architecture") - if new_decoder_architecture: # multi_query is ignored when new_decoder_architecture is True - n_heads = config.num_attention_heads - else: - if hasattr(config, "multi_query"): - multi_query_value = getattr(config, "multi_query") - if multi_query_value: - n_heads = 1 # MQA , multi query is true - else: - n_heads = config.num_attention_heads - d_head = config.hidden_size // config.num_attention_heads - else: - raise ValueError("Invalid model configuration: n_head/d_heads or num_key_value_heads not found.") - padding_shape = [batch_size, n_heads, seq_len, d_head] - if hasattr(config, "architectures") and config.architectures is not None: # Check for Starcoder1 - 3D layout - if "GPTBigCodeForCausalLM" in config.architectures: - padding_shape = [batch_size, seq_len, d_head] - return padding_shape - - -def get_num_layers_from_config(config): - """ - Gets number of layers from model config - -------- - - :config: AutoConfig from pretrained model. - - Return: - number of layers - """ - - if hasattr(config, "n_layer"): # Assuming n_layer is a key in the config (GPTs/CodeGen) - n_layer = config.n_layer - elif hasattr(config, "num_hidden_layers"): # llama/Mistral/Falcon - n_layer = config.num_hidden_layers - elif hasattr(config, "n_layers"): # Check for n_layers in the config (MPT Model) - n_layer = config.n_layers - else: - raise ValueError("Invalid model configuration: n_layer/n_layers or num_hidden_layers not found.") - - return n_layer +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import List, Optional, Tuple, Union + +import requests +from huggingface_hub import login, snapshot_download +from requests.exceptions import HTTPError +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants +from QEfficient.utils.logging_utils import logger + + +def login_and_download_hf_lm(model_name, *args, **kwargs): + logger.info(f"loading HuggingFace model for {model_name}") + hf_token = kwargs.pop("hf_token", None) + cache_dir = kwargs.pop("cache_dir", None) + if hf_token is not None: + login(hf_token) + model_path = hf_download( + repo_id=model_name, + cache_dir=cache_dir, + ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], + ) + return model_path + + +def hf_download( + repo_id: Optional[str] = None, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + allow_patterns: Optional[List[str]] = None, + ignore_patterns: Optional[List[str]] = None, +): + # Setup cache_dir + if cache_dir is not None: + os.makedirs(cache_dir, exist_ok=True) + + max_retries = Constants.MAX_RETRIES + retry_count = 0 + while retry_count < max_retries: + try: + model_path = snapshot_download( + repo_id, + cache_dir=cache_dir, + revision="main", + resume_download=True, + token=hf_token, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + ) + break + except requests.ReadTimeout as e: + logger.info(f"Read timeout: {e}") + retry_count += 1 + + except HTTPError as e: + retry_count = max_retries + if e.response.status_code == 401: + logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") + else: + raise e + + except OSError as e: + logger.error(f"OSError: {e}") + if "Consistency check failed" in str(e): + logger.info( + "OSError: Consistency check failed: file should not be incomplete, Resuming the downloading..." + ) + else: + raise e + + return model_path + + +def qpc_exists(qpc_dir_path: str) -> bool: + """ + Checks if qpc dir exists. + Returns + 1. Boolean variable indicating if qpc files exist + 2. Path of the qpc dir if found. + --------- + + :model_name: `str` - HF Model card name. + :dir_path: `str` - Path of qpc directory. + + Return: + qpc_exists and path to qpc directory + """ + + # Compute the boolean indicating if the QPC exists + qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) + + return qpc_exists_bool + + +def get_onnx_dir_name(model_name, has_fbs): + # Create a unique directory name for the ONNX model + # Clearly indicate whether it's with or without FBS + # Replace all hyphens with underscores + model_name_safe = model_name.replace("/", "_").replace("-", "_") + if has_fbs: + return f"onnx_{model_name_safe}_with_fbs" + else: + return f"onnx_{model_name_safe}_without_fbs" + + +def onnx_exists(model_name: str, full_batch_size: int) -> Tuple[bool, str, str]: + """ + Checks if qpc files already exists, removes the directory if files have been manipulated. + --------- + + :model_name: `str`- HF Model card name. 
+ + Return: + onnx_exists and path to onnx file and directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + # Determine if we're using full_batch_size + has_fbs = full_batch_size is not None + + # ONNX handling + onnx_dir_name = get_onnx_dir_name(model_name, has_fbs) + onnx_dir_path = os.path.join(model_card_dir, onnx_dir_name) + os.makedirs(onnx_dir_path, exist_ok=True) + clipped_onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + unclipped_onnx_model_path = clipped_onnx_model_path.replace("_clipped_fp16.onnx", ".onnx") + + # Compute the boolean indicating if the ONNX model exists + onnx_exists_bool = False + onnx_model_path = None + if os.path.isfile(os.path.join(onnx_dir_path, "custom_io_fp16.yaml")): + if os.path.isfile(clipped_onnx_model_path): + onnx_exists_bool = True + onnx_model_path = clipped_onnx_model_path + elif os.path.isfile(unclipped_onnx_model_path): + onnx_exists_bool = True + onnx_model_path = unclipped_onnx_model_path + + # Return the boolean, onnx_dir_path, and onnx_model_path + return onnx_exists_bool, onnx_dir_path, onnx_model_path + + +def load_hf_tokenizer( + pretrained_model_name_or_path: str, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + padding_side: str = "right", + **kwargs, +) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: + # FIXME: Fix kwargs to take token, cache_dir and pass via kwargs only on line 129 + logger.info("Loading Tokenizer") + if hf_token is not None: + login(hf_token) + # Download tokenizer along with model if it doesn't exist + model_hf_path = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) + else hf_download( + repo_id=pretrained_model_name_or_path, + cache_dir=cache_dir, + allow_patterns=["*.json", "*.py", "*token*", "*.txt"], + ) + ) + tokenizer = AutoTokenizer.from_pretrained( + model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs + ) + padding_check_and_fix(tokenizer) # Check and fix tokenizer viability + + return tokenizer + + +def get_qpc_dir_path( + model_card_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size +): + # Create a unique directory name for the QPC model based on all parameters + qpc_base_dir_name = ( + f"qpc_{num_cores}cores_{batch_size}bs_{prompt_len}pl_{ctx_len}cl_{mos}mos" + + f"{f'_{full_batch_size}fbs_' if full_batch_size is not None else '_'}" + + f"{len(device_group) if device_group is not None else 1}" + + "devices" + + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") + ) + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_card_name)) + os.makedirs(model_card_dir, exist_ok=True) + + qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") + return qpc_dir_path + + +def check_and_assign_cache_dir(local_model_dir, cache_dir): + if local_model_dir is not None: + if cache_dir is not None: + logger.warning( + f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir." + ) + return None + return cache_dir if cache_dir else None + + +def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None: + """ + Checks and fixes tokenizer padding side and pad_token_id viability. + -------- + + tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to check and fix. 
+ """ + if tokenizer.padding_side != "right": + logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") + tokenizer.padding_side = "right" + + if tokenizer.pad_token_id is None: + assert tokenizer.eos_token_id is not None, "Found tokenizer.eos_token_id to be None, expected int" + # If Pad token is out of range of vocab size + if tokenizer.eos_token_id < tokenizer.vocab_size: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.pad_token_id = tokenizer.vocab_size - 1 + + +def get_padding_shape_from_config(config, batch_size, seq_len): + """ + Gets padding dims from model config - number of kv heads and d_head + and returns padding shape - (batch_size, number of kv heads, seq_len, hidden size) + required for initialization of past_key_values + -------- + + :config: AutoConfig from pretrained model. + :batch_size: int. number of input prompts used to create inputs + :seq_len: int. sequence length to run the model for. + + Return: + List[int, int, int, int] + """ + + if hasattr(config, "n_head"): # Assuming n_head is a key in the config (GPTs/CodeGen) + n_heads = config.n_head + d_head = config.n_embd // config.n_head + elif hasattr(config, "num_key_value_heads") and hasattr( + config, "num_attention_heads" + ): # Check for num_key_value_heads (Llama/Mistral) + n_heads = config.num_key_value_heads + d_head = config.hidden_size // config.num_attention_heads + elif hasattr(config, "n_heads"): # Check for n_heads and d_model in the config (MPT Model) + n_heads = config.n_heads + d_head = config.d_model // config.n_heads + elif hasattr(config, "new_decoder_architecture"): # Check for Falcon + new_decoder_architecture = getattr(config, "new_decoder_architecture") + if new_decoder_architecture: # multi_query is ignored when new_decoder_architecture is True + n_heads = config.num_attention_heads + else: + if hasattr(config, "multi_query"): + multi_query_value = getattr(config, "multi_query") + if multi_query_value: + n_heads = 1 # MQA , multi query is true + else: + n_heads = config.num_attention_heads + d_head = config.hidden_size // config.num_attention_heads + else: + raise ValueError("Invalid model configuration: n_head/d_heads or num_key_value_heads not found.") + padding_shape = [batch_size, n_heads, seq_len, d_head] + if hasattr(config, "architectures") and config.architectures is not None: # Check for Starcoder1 - 3D layout + if "GPTBigCodeForCausalLM" in config.architectures: + padding_shape = [batch_size, seq_len, d_head] + return padding_shape + + +def get_num_layers_from_config(config): + """ + Gets number of layers from model config + -------- + + :config: AutoConfig from pretrained model. 
+ + Return: + number of layers + """ + + if hasattr(config, "n_layer"): # Assuming n_layer is a key in the config (GPTs/CodeGen) + n_layer = config.n_layer + elif hasattr(config, "num_hidden_layers"): # llama/Mistral/Falcon + n_layer = config.num_hidden_layers + elif hasattr(config, "n_layers"): # Check for n_layers in the config (MPT Model) + n_layer = config.n_layers + else: + raise ValueError("Invalid model configuration: n_layer/n_layers or num_hidden_layers not found.") + + return n_layer diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 32a9e0e4a..0bc23a2c2 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -55,3 +55,4 @@ class Constants: INPUT_STR = ["My name is"] GB = 2**30 MAX_QPC_LIMIT = 30 + MAX_RETRIES = 5 From 87465ec816272b6fbcb9d7cb120dc96db6d6b021 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 17 Oct 2024 10:42:10 +0530 Subject: [PATCH 2/5] linter fixed I Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/_utils.py | 592 ++++++++++++++++++------------------- 1 file changed, 296 insertions(+), 296 deletions(-) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 2150b80f7..224935d53 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -1,296 +1,296 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import os -from typing import List, Optional, Tuple, Union - -import requests -from huggingface_hub import login, snapshot_download -from requests.exceptions import HTTPError -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast - -from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants -from QEfficient.utils.logging_utils import logger - - -def login_and_download_hf_lm(model_name, *args, **kwargs): - logger.info(f"loading HuggingFace model for {model_name}") - hf_token = kwargs.pop("hf_token", None) - cache_dir = kwargs.pop("cache_dir", None) - if hf_token is not None: - login(hf_token) - model_path = hf_download( - repo_id=model_name, - cache_dir=cache_dir, - ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], - ) - return model_path - - -def hf_download( - repo_id: Optional[str] = None, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - allow_patterns: Optional[List[str]] = None, - ignore_patterns: Optional[List[str]] = None, -): - # Setup cache_dir - if cache_dir is not None: - os.makedirs(cache_dir, exist_ok=True) - - max_retries = Constants.MAX_RETRIES - retry_count = 0 - while retry_count < max_retries: - try: - model_path = snapshot_download( - repo_id, - cache_dir=cache_dir, - revision="main", - resume_download=True, - token=hf_token, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - ) - break - except requests.ReadTimeout as e: - logger.info(f"Read timeout: {e}") - retry_count += 1 - - except HTTPError as e: - retry_count = max_retries - if e.response.status_code == 401: - logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") - else: - raise e - - except OSError as e: - logger.error(f"OSError: {e}") - if "Consistency check failed" in str(e): - logger.info( - "OSError: Consistency check failed: file should not be incomplete, Resuming the downloading..." 
- ) - else: - raise e - - return model_path - - -def qpc_exists(qpc_dir_path: str) -> bool: - """ - Checks if qpc dir exists. - Returns - 1. Boolean variable indicating if qpc files exist - 2. Path of the qpc dir if found. - --------- - - :model_name: `str` - HF Model card name. - :dir_path: `str` - Path of qpc directory. - - Return: - qpc_exists and path to qpc directory - """ - - # Compute the boolean indicating if the QPC exists - qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) - - return qpc_exists_bool - - -def get_onnx_dir_name(model_name, has_fbs): - # Create a unique directory name for the ONNX model - # Clearly indicate whether it's with or without FBS - # Replace all hyphens with underscores - model_name_safe = model_name.replace("/", "_").replace("-", "_") - if has_fbs: - return f"onnx_{model_name_safe}_with_fbs" - else: - return f"onnx_{model_name_safe}_without_fbs" - - -def onnx_exists(model_name: str, full_batch_size: int) -> Tuple[bool, str, str]: - """ - Checks if qpc files already exists, removes the directory if files have been manipulated. - --------- - - :model_name: `str`- HF Model card name. - - Return: - onnx_exists and path to onnx file and directory - """ - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) - os.makedirs(model_card_dir, exist_ok=True) - - # Determine if we're using full_batch_size - has_fbs = full_batch_size is not None - - # ONNX handling - onnx_dir_name = get_onnx_dir_name(model_name, has_fbs) - onnx_dir_path = os.path.join(model_card_dir, onnx_dir_name) - os.makedirs(onnx_dir_path, exist_ok=True) - clipped_onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") - unclipped_onnx_model_path = clipped_onnx_model_path.replace("_clipped_fp16.onnx", ".onnx") - - # Compute the boolean indicating if the ONNX model exists - onnx_exists_bool = False - onnx_model_path = None - if os.path.isfile(os.path.join(onnx_dir_path, "custom_io_fp16.yaml")): - if os.path.isfile(clipped_onnx_model_path): - onnx_exists_bool = True - onnx_model_path = clipped_onnx_model_path - elif os.path.isfile(unclipped_onnx_model_path): - onnx_exists_bool = True - onnx_model_path = unclipped_onnx_model_path - - # Return the boolean, onnx_dir_path, and onnx_model_path - return onnx_exists_bool, onnx_dir_path, onnx_model_path - - -def load_hf_tokenizer( - pretrained_model_name_or_path: str, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - padding_side: str = "right", - **kwargs, -) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: - # FIXME: Fix kwargs to take token, cache_dir and pass via kwargs only on line 129 - logger.info("Loading Tokenizer") - if hf_token is not None: - login(hf_token) - # Download tokenizer along with model if it doesn't exist - model_hf_path = ( - pretrained_model_name_or_path - if os.path.isdir(pretrained_model_name_or_path) - else hf_download( - repo_id=pretrained_model_name_or_path, - cache_dir=cache_dir, - allow_patterns=["*.json", "*.py", "*token*", "*.txt"], - ) - ) - tokenizer = AutoTokenizer.from_pretrained( - model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs - ) - padding_check_and_fix(tokenizer) # Check and fix tokenizer viability - - return tokenizer - - -def get_qpc_dir_path( - model_card_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size -): - # Create a unique directory name for the QPC model based on all parameters - 
qpc_base_dir_name = ( - f"qpc_{num_cores}cores_{batch_size}bs_{prompt_len}pl_{ctx_len}cl_{mos}mos" - + f"{f'_{full_batch_size}fbs_' if full_batch_size is not None else '_'}" - + f"{len(device_group) if device_group is not None else 1}" - + "devices" - + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") - ) - model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_card_name)) - os.makedirs(model_card_dir, exist_ok=True) - - qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") - return qpc_dir_path - - -def check_and_assign_cache_dir(local_model_dir, cache_dir): - if local_model_dir is not None: - if cache_dir is not None: - logger.warning( - f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir." - ) - return None - return cache_dir if cache_dir else None - - -def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None: - """ - Checks and fixes tokenizer padding side and pad_token_id viability. - -------- - - tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to check and fix. - """ - if tokenizer.padding_side != "right": - logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") - tokenizer.padding_side = "right" - - if tokenizer.pad_token_id is None: - assert tokenizer.eos_token_id is not None, "Found tokenizer.eos_token_id to be None, expected int" - # If Pad token is out of range of vocab size - if tokenizer.eos_token_id < tokenizer.vocab_size: - tokenizer.pad_token_id = tokenizer.eos_token_id - else: - tokenizer.pad_token_id = tokenizer.vocab_size - 1 - - -def get_padding_shape_from_config(config, batch_size, seq_len): - """ - Gets padding dims from model config - number of kv heads and d_head - and returns padding shape - (batch_size, number of kv heads, seq_len, hidden size) - required for initialization of past_key_values - -------- - - :config: AutoConfig from pretrained model. - :batch_size: int. number of input prompts used to create inputs - :seq_len: int. sequence length to run the model for. 
- - Return: - List[int, int, int, int] - """ - - if hasattr(config, "n_head"): # Assuming n_head is a key in the config (GPTs/CodeGen) - n_heads = config.n_head - d_head = config.n_embd // config.n_head - elif hasattr(config, "num_key_value_heads") and hasattr( - config, "num_attention_heads" - ): # Check for num_key_value_heads (Llama/Mistral) - n_heads = config.num_key_value_heads - d_head = config.hidden_size // config.num_attention_heads - elif hasattr(config, "n_heads"): # Check for n_heads and d_model in the config (MPT Model) - n_heads = config.n_heads - d_head = config.d_model // config.n_heads - elif hasattr(config, "new_decoder_architecture"): # Check for Falcon - new_decoder_architecture = getattr(config, "new_decoder_architecture") - if new_decoder_architecture: # multi_query is ignored when new_decoder_architecture is True - n_heads = config.num_attention_heads - else: - if hasattr(config, "multi_query"): - multi_query_value = getattr(config, "multi_query") - if multi_query_value: - n_heads = 1 # MQA , multi query is true - else: - n_heads = config.num_attention_heads - d_head = config.hidden_size // config.num_attention_heads - else: - raise ValueError("Invalid model configuration: n_head/d_heads or num_key_value_heads not found.") - padding_shape = [batch_size, n_heads, seq_len, d_head] - if hasattr(config, "architectures") and config.architectures is not None: # Check for Starcoder1 - 3D layout - if "GPTBigCodeForCausalLM" in config.architectures: - padding_shape = [batch_size, seq_len, d_head] - return padding_shape - - -def get_num_layers_from_config(config): - """ - Gets number of layers from model config - -------- - - :config: AutoConfig from pretrained model. - - Return: - number of layers - """ - - if hasattr(config, "n_layer"): # Assuming n_layer is a key in the config (GPTs/CodeGen) - n_layer = config.n_layer - elif hasattr(config, "num_hidden_layers"): # llama/Mistral/Falcon - n_layer = config.num_hidden_layers - elif hasattr(config, "n_layers"): # Check for n_layers in the config (MPT Model) - n_layer = config.n_layers - else: - raise ValueError("Invalid model configuration: n_layer/n_layers or num_hidden_layers not found.") - - return n_layer +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +from typing import List, Optional, Tuple, Union + +import requests +from huggingface_hub import login, snapshot_download +from requests.exceptions import HTTPError +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants +from QEfficient.utils.logging_utils import logger + + +def login_and_download_hf_lm(model_name, *args, **kwargs): + logger.info(f"loading HuggingFace model for {model_name}") + hf_token = kwargs.pop("hf_token", None) + cache_dir = kwargs.pop("cache_dir", None) + if hf_token is not None: + login(hf_token) + model_path = hf_download( + repo_id=model_name, + cache_dir=cache_dir, + ignore_patterns=["*.txt", "*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.msgpack", "*.h5"], + ) + return model_path + + +def hf_download( + repo_id: Optional[str] = None, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + allow_patterns: Optional[List[str]] = None, + ignore_patterns: Optional[List[str]] = None, +): + # Setup cache_dir + if cache_dir is not None: + os.makedirs(cache_dir, exist_ok=True) + + max_retries = Constants.MAX_RETRIES + retry_count = 0 + while retry_count < max_retries: + try: + model_path = snapshot_download( + repo_id, + cache_dir=cache_dir, + revision="main", + resume_download=True, + token=hf_token, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + ) + break + except requests.ReadTimeout as e: + logger.info(f"Read timeout: {e}") + retry_count += 1 + + except HTTPError as e: + retry_count = max_retries + if e.response.status_code == 401: + logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") + else: + raise e + + except OSError as e: + logger.error(f"OSError: {e}") + if "Consistency check failed" in str(e): + logger.info( + "OSError: Consistency check failed: file should not be incomplete, Resuming the downloading..." + ) + else: + raise e + + return model_path + + +def qpc_exists(qpc_dir_path: str) -> bool: + """ + Checks if qpc dir exists. + Returns + 1. Boolean variable indicating if qpc files exist + 2. Path of the qpc dir if found. + --------- + + :model_name: `str` - HF Model card name. + :dir_path: `str` - Path of qpc directory. + + Return: + qpc_exists and path to qpc directory + """ + + # Compute the boolean indicating if the QPC exists + qpc_exists_bool = os.path.isdir(qpc_dir_path) and os.path.isfile(os.path.join(qpc_dir_path, "programqpc.bin")) + + return qpc_exists_bool + + +def get_onnx_dir_name(model_name, has_fbs): + # Create a unique directory name for the ONNX model + # Clearly indicate whether it's with or without FBS + # Replace all hyphens with underscores + model_name_safe = model_name.replace("/", "_").replace("-", "_") + if has_fbs: + return f"onnx_{model_name_safe}_with_fbs" + else: + return f"onnx_{model_name_safe}_without_fbs" + + +def onnx_exists(model_name: str, full_batch_size: int) -> Tuple[bool, str, str]: + """ + Checks if qpc files already exists, removes the directory if files have been manipulated. + --------- + + :model_name: `str`- HF Model card name. 
+ + Return: + onnx_exists and path to onnx file and directory + """ + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name)) + os.makedirs(model_card_dir, exist_ok=True) + + # Determine if we're using full_batch_size + has_fbs = full_batch_size is not None + + # ONNX handling + onnx_dir_name = get_onnx_dir_name(model_name, has_fbs) + onnx_dir_path = os.path.join(model_card_dir, onnx_dir_name) + os.makedirs(onnx_dir_path, exist_ok=True) + clipped_onnx_model_path = os.path.join(onnx_dir_path, model_name.replace("/", "_") + "_kv_clipped_fp16.onnx") + unclipped_onnx_model_path = clipped_onnx_model_path.replace("_clipped_fp16.onnx", ".onnx") + + # Compute the boolean indicating if the ONNX model exists + onnx_exists_bool = False + onnx_model_path = None + if os.path.isfile(os.path.join(onnx_dir_path, "custom_io_fp16.yaml")): + if os.path.isfile(clipped_onnx_model_path): + onnx_exists_bool = True + onnx_model_path = clipped_onnx_model_path + elif os.path.isfile(unclipped_onnx_model_path): + onnx_exists_bool = True + onnx_model_path = unclipped_onnx_model_path + + # Return the boolean, onnx_dir_path, and onnx_model_path + return onnx_exists_bool, onnx_dir_path, onnx_model_path + + +def load_hf_tokenizer( + pretrained_model_name_or_path: str, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + padding_side: str = "right", + **kwargs, +) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]: + # FIXME: Fix kwargs to take token, cache_dir and pass via kwargs only on line 129 + logger.info("Loading Tokenizer") + if hf_token is not None: + login(hf_token) + # Download tokenizer along with model if it doesn't exist + model_hf_path = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) + else hf_download( + repo_id=pretrained_model_name_or_path, + cache_dir=cache_dir, + allow_patterns=["*.json", "*.py", "*token*", "*.txt"], + ) + ) + tokenizer = AutoTokenizer.from_pretrained( + model_hf_path, padding_side=padding_side, trust_remote_code=True, **kwargs + ) + padding_check_and_fix(tokenizer) # Check and fix tokenizer viability + + return tokenizer + + +def get_qpc_dir_path( + model_card_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size +): + # Create a unique directory name for the QPC model based on all parameters + qpc_base_dir_name = ( + f"qpc_{num_cores}cores_{batch_size}bs_{prompt_len}pl_{ctx_len}cl_{mos}mos" + + f"{f'_{full_batch_size}fbs_' if full_batch_size is not None else '_'}" + + f"{len(device_group) if device_group is not None else 1}" + + "devices" + + ("_mxfp6_mxint8" if (mxfp6 and mxint8) else "_mxfp6" if mxfp6 else "_fp16_mxint8" if mxint8 else "_fp16") + ) + model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_card_name)) + os.makedirs(model_card_dir, exist_ok=True) + + qpc_dir_path = os.path.join(model_card_dir, qpc_base_dir_name, "qpcs") + return qpc_dir_path + + +def check_and_assign_cache_dir(local_model_dir, cache_dir): + if local_model_dir is not None: + if cache_dir is not None: + logger.warning( + f"Both local_model_dir ({local_model_dir}) and cache_dir ({cache_dir}) given. Using local_model_dir." + ) + return None + return cache_dir if cache_dir else None + + +def padding_check_and_fix(tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None: + """ + Checks and fixes tokenizer padding side and pad_token_id viability. + -------- + + tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to check and fix. 
+ """ + if tokenizer.padding_side != "right": + logger.warning(f"Setting tokenizer padding_side to 'right', got {tokenizer.padding_side}") + tokenizer.padding_side = "right" + + if tokenizer.pad_token_id is None: + assert tokenizer.eos_token_id is not None, "Found tokenizer.eos_token_id to be None, expected int" + # If Pad token is out of range of vocab size + if tokenizer.eos_token_id < tokenizer.vocab_size: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.pad_token_id = tokenizer.vocab_size - 1 + + +def get_padding_shape_from_config(config, batch_size, seq_len): + """ + Gets padding dims from model config - number of kv heads and d_head + and returns padding shape - (batch_size, number of kv heads, seq_len, hidden size) + required for initialization of past_key_values + -------- + + :config: AutoConfig from pretrained model. + :batch_size: int. number of input prompts used to create inputs + :seq_len: int. sequence length to run the model for. + + Return: + List[int, int, int, int] + """ + + if hasattr(config, "n_head"): # Assuming n_head is a key in the config (GPTs/CodeGen) + n_heads = config.n_head + d_head = config.n_embd // config.n_head + elif hasattr(config, "num_key_value_heads") and hasattr( + config, "num_attention_heads" + ): # Check for num_key_value_heads (Llama/Mistral) + n_heads = config.num_key_value_heads + d_head = config.hidden_size // config.num_attention_heads + elif hasattr(config, "n_heads"): # Check for n_heads and d_model in the config (MPT Model) + n_heads = config.n_heads + d_head = config.d_model // config.n_heads + elif hasattr(config, "new_decoder_architecture"): # Check for Falcon + new_decoder_architecture = getattr(config, "new_decoder_architecture") + if new_decoder_architecture: # multi_query is ignored when new_decoder_architecture is True + n_heads = config.num_attention_heads + else: + if hasattr(config, "multi_query"): + multi_query_value = getattr(config, "multi_query") + if multi_query_value: + n_heads = 1 # MQA , multi query is true + else: + n_heads = config.num_attention_heads + d_head = config.hidden_size // config.num_attention_heads + else: + raise ValueError("Invalid model configuration: n_head/d_heads or num_key_value_heads not found.") + padding_shape = [batch_size, n_heads, seq_len, d_head] + if hasattr(config, "architectures") and config.architectures is not None: # Check for Starcoder1 - 3D layout + if "GPTBigCodeForCausalLM" in config.architectures: + padding_shape = [batch_size, seq_len, d_head] + return padding_shape + + +def get_num_layers_from_config(config): + """ + Gets number of layers from model config + -------- + + :config: AutoConfig from pretrained model. 
+ + Return: + number of layers + """ + + if hasattr(config, "n_layer"): # Assuming n_layer is a key in the config (GPTs/CodeGen) + n_layer = config.n_layer + elif hasattr(config, "num_hidden_layers"): # llama/Mistral/Falcon + n_layer = config.num_hidden_layers + elif hasattr(config, "n_layers"): # Check for n_layers in the config (MPT Model) + n_layer = config.n_layers + else: + raise ValueError("Invalid model configuration: n_layer/n_layers or num_hidden_layers not found.") + + return n_layer From f06bbcc8b4cb83ac10c99a99bdf0d867be14a7c3 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 17 Oct 2024 11:53:08 +0530 Subject: [PATCH 3/5] Requested Changes have been made Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/_utils.py | 2 +- QEfficient/utils/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 224935d53..552041fe9 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -71,7 +71,7 @@ def hf_download( logger.error(f"OSError: {e}") if "Consistency check failed" in str(e): logger.info( - "OSError: Consistency check failed: file should not be incomplete, Resuming the downloading..." + "Consistency check failed during model download. The file appears to be incomplete. Resuming the download..." ) else: raise e diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 0bc23a2c2..cf677bb97 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -55,4 +55,4 @@ class Constants: INPUT_STR = ["My name is"] GB = 2**30 MAX_QPC_LIMIT = 30 - MAX_RETRIES = 5 + MAX_RETRIES = 5 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download From e303768b0c80b2ef1422e3ee7199f7ff9ada2bdf Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 17 Oct 2024 16:30:16 +0530 Subject: [PATCH 4/5] removed extra lines Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 552041fe9..3648e7853 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -42,9 +42,8 @@ def hf_download( if cache_dir is not None: os.makedirs(cache_dir, exist_ok=True) - max_retries = Constants.MAX_RETRIES retry_count = 0 - while retry_count < max_retries: + while retry_count < Constants.MAX_RETRIES: try: model_path = snapshot_download( repo_id, @@ -61,7 +60,7 @@ def hf_download( retry_count += 1 except HTTPError as e: - retry_count = max_retries + retry_count = Constants.MAX_RETRIES if e.response.status_code == 401: logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") else: From 63cde2b296b98326d0e4bf0160026c68affbc61b Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 17 Oct 2024 19:39:56 +0530 Subject: [PATCH 5/5] fixed hf_download function Signed-off-by: Onkar Chougule --- QEfficient/utils/_utils.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 3648e7853..223c58d5c 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -17,6 +17,12 @@ from QEfficient.utils.logging_utils import logger +class DownloadRetryLimitExceeded(Exception): + """ + Used for raising error when hf_download fails to download the model after given max_retries. 
+ """ + + def login_and_download_hf_lm(model_name, *args, **kwargs): logger.info(f"loading HuggingFace model for {model_name}") hf_token = kwargs.pop("hf_token", None) @@ -37,13 +43,14 @@ def hf_download( hf_token: Optional[str] = None, allow_patterns: Optional[List[str]] = None, ignore_patterns: Optional[List[str]] = None, + max_retries: Optional[int] = Constants.MAX_RETRIES, ): # Setup cache_dir if cache_dir is not None: os.makedirs(cache_dir, exist_ok=True) retry_count = 0 - while retry_count < Constants.MAX_RETRIES: + while retry_count < max_retries: try: model_path = snapshot_download( repo_id, @@ -58,23 +65,23 @@ def hf_download( except requests.ReadTimeout as e: logger.info(f"Read timeout: {e}") retry_count += 1 - except HTTPError as e: - retry_count = Constants.MAX_RETRIES if e.response.status_code == 401: logger.info("You need to pass a valid `--hf_token=...` to download private checkpoints.") - else: - raise e - + raise e except OSError as e: - logger.error(f"OSError: {e}") if "Consistency check failed" in str(e): logger.info( "Consistency check failed during model download. The file appears to be incomplete. Resuming the download..." ) + retry_count += 1 else: raise e + if retry_count >= max_retries: + raise DownloadRetryLimitExceeded( + f"Unable to download full model after {max_retries} tries. If the model fileS are huge in size, please try again." + ) return model_path
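
For reference, a minimal caller-side sketch of the retry behaviour this series introduces; the repo id and cache directory below are placeholder values, everything else (hf_download, Constants.MAX_RETRIES, DownloadRetryLimitExceeded) comes from the patches above:

    from QEfficient.utils._utils import DownloadRetryLimitExceeded, hf_download

    try:
        # hf_download retries up to Constants.MAX_RETRIES times on read timeouts
        # and on huggingface_hub "Consistency check failed" OSErrors before giving up.
        model_path = hf_download(
            repo_id="gpt2",                # placeholder model card name
            cache_dir="/tmp/qeff_cache",   # placeholder cache directory
            ignore_patterns=["*.onnx", "*.h5"],
        )
        print(f"Model downloaded to {model_path}")
    except DownloadRetryLimitExceeded as e:
        # Raised by patch 5/5 once the retry budget is exhausted.
        print(f"Download did not complete: {e}")

Callers that need a different retry budget can pass max_retries explicitly, since patch 5/5 exposes it as a keyword argument defaulting to Constants.MAX_RETRIES.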