
Commit 6dba3db

quic-amitraj authored and ochougul committed
Adding support for GPTQ models (#103)
* Adding support for gptq models Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* Code cleaning and formating Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* ruff format and fixed some bug Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* Added tests for gptq Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* Bug-fix-1 Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* fixed bugs-2 Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* fixed bug-3 Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* Added docstring Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* Addressed comments Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* Addressed comments Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* fixed bugs-3 Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* ruff check and format Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
* Addressed comments-3 Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>

---------

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
1 parent e038e3d commit 6dba3db

File tree

13 files changed, +740 -224 lines changed


QEfficient/base/common.py

Lines changed: 0 additions & 1 deletion
@@ -31,7 +31,6 @@ class QEFF_MODEL_TYPE(Enum):
 
     CAUSALLM = "LLM"
     DIFFUSION = "DIFFUSION"
-    AWQ = "AWQ"
 
 
 MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = {

QEfficient/customop/matmulnbits.py

Lines changed: 19 additions & 23 deletions
@@ -13,7 +13,7 @@
 
 class QuantLinearTorchFunction(torch.autograd.Function):
     @staticmethod
-    def symbolic(g, x, qself_qweight, qself_scales, qself_qzeros, g_idx, bits, groupsize, in_features, out_features):
+    def symbolic(g, x, qself_qweight, qself_scales, qself_qzeros, g_idx, bits, group_size, in_features, out_features):
         input_tuple = (x, qself_qweight, qself_scales, qself_qzeros)
         input_tuple += (g_idx,) if g_idx is not None else ()
         return g.op(
@@ -23,36 +23,32 @@ def symbolic(g, x, qself_qweight, qself_scales, qself_qzeros, g_idx, bits, group
             K_i=in_features,
             N_i=out_features,
             bits_i=bits,
-            block_size_i=groupsize,
+            block_size_i=group_size,
         )
 
     @staticmethod
-    def forward(ctx, x, qself_qweight, qself_scales, qself_qzeros, g_idx, bits, groupsize, in_features, out_features):
+    def forward(ctx, x, qself_qweight, qself_scales, qself_qzeros, g_idx, bits, group_size, in_features, out_features):
         if torch.onnx.is_in_onnx_export():
-            return torch.zeros(x.shape[:-1] + (out_features,), dtype=x.dtype, device=x.device).float()
+            return torch.zeros(x.shape[:-1] + (out_features,), dtype=x.dtype).float()
         fp_weight = dequantize_blockwise_bits(
-            qself_qweight, qself_scales, qself_qzeros, bits, groupsize, g_idx, in_features, out_features
+            qself_qweight, qself_scales, qself_qzeros, bits, group_size, g_idx, in_features, out_features
         )[0].float()
 
         return torch.matmul(x.float(), fp_weight.T.float())
 
 
-def dequantize_blockwise_bits(quant_values, scale, zero_point, bits, groupsize, g_idx, rows, cols):
+def dequantize_blockwise_bits(quant_values, scale, zero_point, bits, group_size, g_idx, rows, cols):
     if bits != 4:
         raise ValueError("Only bits=4 is supported for executing quantized model")
-    if groupsize != 128:
-        raise ValueError("Only groupsize=128 is supported for executing quantized model")
-    expand_quant_value = (
-        quant_values.unsqueeze(-1) >> torch.tensor([[[[0, 4]]]], dtype=torch.int32, device=quant_values.device)
-    ) & 0x0F
+    if group_size != 128:
+        raise ValueError("Only group_size=128 is supported for executing quantized model")
+    expand_quant_value = (quant_values.unsqueeze(-1) >> torch.tensor([[[[0, 4]]]], dtype=torch.int32)) & 0x0F
     expand_quant_value = expand_quant_value.reshape(*quant_values.shape[:-1], -1)
     aligned_scale = scale.reshape(*quant_values.shape[:-1], 1)
     if zero_point.dtype == scale.dtype:
         expand_zero_point = zero_point.reshape(*quant_values.shape[:-1], -1)
     else:
-        expand_zero_point = (
-            zero_point.unsqueeze(-1) >> torch.tensor([[[[0, 4]]]], dtype=torch.int32, device=quant_values.device)
-        ) & 0x0F
+        expand_zero_point = (zero_point.unsqueeze(-1) >> torch.tensor([[[[0, 4]]]], dtype=torch.int32)) & 0x0F
     try:
         expand_zero_point = expand_zero_point.reshape(*quant_values.shape[:-1], -1)
     # FIXME: remove try-except
@@ -79,30 +75,30 @@ def dequantize_blockwise_bits(quant_values, scale, zero_point, bits, groupsize,
 
 
 class QuantLinearORT(nn.Module):
-    def __init__(self, bits, groupsize, in_features, out_features, bias):
+    def __init__(self, bits, group_size, in_features, out_features, bias):
         super().__init__()
         if bits not in [2, 3, 4, 5, 6, 7, 8]:
             raise NotImplementedError("Only 2,4,5,6,7,8 bits are supported.")
         self.in_features = in_features
         self.out_features = out_features
         self.bits = bits
-        self.groupsize = groupsize if groupsize != -1 else in_features
+        self.group_size = group_size if group_size != -1 else in_features
         self.act_order = None
 
-        q_rows = in_features // self.groupsize
+        q_rows = in_features // self.group_size
         self.register_buffer(
             "qweight",
-            torch.zeros((out_features, q_rows, self.groupsize // (8 // bits)), dtype=torch.uint8),
+            torch.zeros((out_features, q_rows, self.group_size // (8 // bits)), dtype=torch.uint8),
         )
         self.register_buffer(
             "qzeros",
             torch.zeros((q_rows + (q_rows & 1)) * (out_features // 8 * self.bits), dtype=torch.uint8),
         )
         self.register_buffer(
-            "scales", torch.zeros((math.ceil(in_features / self.groupsize) * out_features), dtype=torch.float16)
+            "scales", torch.zeros((math.ceil(in_features / self.group_size) * out_features), dtype=torch.float16)
         )
         self.register_buffer(
-            "g_idx", torch.tensor([i // self.groupsize for i in range(in_features)], dtype=torch.int32)
+            "g_idx", torch.tensor([i // self.group_size for i in range(in_features)], dtype=torch.int32)
         )
         if bias:
             self.register_buffer("bias", torch.zeros((out_features), dtype=torch.float16))
@@ -121,13 +117,13 @@ def pack_on_device(self, int_weight, int_zeros):
             raise ValueError("only 4bit is supported by ONNXRUNTIME for now.")
 
         # Order of groups
-        self.act_order = self.g_idx[: self.groupsize // self.bits].sum().item() != 0
+        self.act_order = self.g_idx[: self.group_size // self.bits].sum().item() != 0
 
         intzeros_pt = int_zeros.T if int_zeros.dtype == self.scales.dtype else int_zeros.T.byte()
         scales_pt = self.scales.T.to(int_weight.device)
         intweight_pt = int_weight.byte()
 
-        block_size = self.groupsize
+        block_size = self.group_size
         rows, cols = intweight_pt.shape
         blob_size = block_size // 2
         k_blocks = (rows + block_size - 1) // block_size
@@ -178,7 +174,7 @@ def forward(self, inputs):
             self.qzeros,
             self.g_idx if self.act_order else None,
             self.bits,
-            self.groupsize,
+            self.group_size,
             self.in_features,
             self.out_features,
         )
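Note: the dequantize_blockwise_bits path above relies on each uint8 of qweight holding two 4-bit values, which the shift tensor [[[[0, 4]]]] and the & 0x0F mask split apart. A minimal, self-contained sketch of that unpack step (illustrative only, not part of the diff):

import torch

# Pack two 4-bit values (5 and 12) into one byte: low nibble first.
packed = torch.tensor([5 | (12 << 4)], dtype=torch.int32)

# Shifting by [0, 4] and masking with 0x0F recovers both nibbles, mirroring
# the expand_quant_value computation in dequantize_blockwise_bits.
shifts = torch.tensor([[0, 4]], dtype=torch.int32)
unpacked = (packed.unsqueeze(-1) >> shifts) & 0x0F
print(unpacked.tolist())  # [[5, 12]]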

QEfficient/transformers/models/modeling_auto.py

Lines changed: 8 additions & 5 deletions
@@ -15,8 +15,9 @@
 from QEfficient.base.modeling_qeff import QEFFBaseModel, Runtime
 from QEfficient.transformers.pytorch_transforms import CBTransform, CustomOpsTransform, KVCacheTransform
 from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers
-from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform
+from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform
 from QEfficient.transformers.quantizers.quantizer_awq import QEffAwqConfig
+from QEfficient.transformers.quantizers.quantizer_gptq import QEffGPTQConfig
 from QEfficient.utils import get_qpc_dir_path, load_hf_tokenizer
 from QEfficient.utils.logging_utils import logger
 
@@ -167,10 +168,12 @@ def transform(self, **kwargs):
             self._pytorch_transforms.append(CBTransform)
 
         # Update list of pytorch transforms if the model falls in AWQ/GPTQ category
-        if hasattr(self.model.config, "quantization_config") and isinstance(
-            self.model.config.quantization_config, QEffAwqConfig
-        ):
-            self._pytorch_transforms.insert(0, AwqToMatmulNbitsTransform)
+        if hasattr(self.model.config, "quantization_config"):
+            if isinstance(self.model.config.quantization_config, QEffAwqConfig):
+                self._pytorch_transforms.insert(0, AwqToMatmulNbitsTransform)
+
+            if isinstance(self.model.config.quantization_config, QEffGPTQConfig):
+                self._pytorch_transforms.insert(0, GPTQToMatmulNbitsTransform)
 
         for transform in self._pytorch_transforms:
             transform.apply(self.model)
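With this hunk, a checkpoint whose quantization_config resolves to QEffGPTQConfig gets GPTQToMatmulNbitsTransform prepended to the transform list, mirroring the existing AWQ path. A hypothetical usage sketch; the auto class name, import path, and model card below are assumptions, not part of this diff:

from QEfficient import QEFFAutoModelForCausalLM  # assumed export path

# Loading a 4-bit GPTQ checkpoint: the replaced quantizers route "gptq" to
# QEffGPTQQuantizer, and transform() then inserts GPTQToMatmulNbitsTransform
# ahead of the other PyTorch transforms.
model = QEFFAutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-GPTQ")  # example model card
model.transform()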

QEfficient/transformers/quantizers/auto.py

Lines changed: 3 additions & 3 deletions
@@ -8,10 +8,10 @@
 from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING, AUTO_QUANTIZER_MAPPING
 
 from QEfficient.transformers.quantizers.quantizer_awq import QEffAwqConfig, QEffAwqQuantizer
+from QEfficient.transformers.quantizers.quantizer_gptq import QEffGPTQConfig, QEffGPTQQuantizer
 
-QEFF_AUTO_QUANTIZER_MAPPING = {"awq": QEffAwqQuantizer}
-
-QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING = {"awq": QEffAwqConfig}
+QEFF_AUTO_QUANTIZER_MAPPING = {"awq": QEffAwqQuantizer, "gptq": QEffGPTQQuantizer}
+QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING = {"awq": QEffAwqConfig, "gptq": QEffGPTQConfig}
 
 
 def with_replaced_quantizers(func):
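The two mappings above are what with_replaced_quantizers (declared just below) swaps into transformers' own registries while a wrapped call runs. A minimal sketch of that pattern, shown as an illustration rather than the repository's exact implementation:

import functools

from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING, AUTO_QUANTIZER_MAPPING

from QEfficient.transformers.quantizers.auto import (
    QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING,
    QEFF_AUTO_QUANTIZER_MAPPING,
)


def with_replaced_quantizers_sketch(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        saved = {}
        # Temporarily route "awq"/"gptq" to the QEff quantizer and config classes.
        for key, quantizer_cls in QEFF_AUTO_QUANTIZER_MAPPING.items():
            saved[key] = (AUTO_QUANTIZER_MAPPING[key], AUTO_QUANTIZATION_CONFIG_MAPPING[key])
            AUTO_QUANTIZER_MAPPING[key] = quantizer_cls
            AUTO_QUANTIZATION_CONFIG_MAPPING[key] = QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING[key]
        try:
            return func(*args, **kwargs)
        finally:
            # Restore transformers' original mappings afterwards.
            for key, (quantizer_cls, config_cls) in saved.items():
                AUTO_QUANTIZER_MAPPING[key] = quantizer_cls
                AUTO_QUANTIZATION_CONFIG_MAPPING[key] = config_cls

    return wrapper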

QEfficient/transformers/quantizers/awq.py

Lines changed: 9 additions & 59 deletions
@@ -8,25 +8,27 @@
 import torch
 import torch.nn as nn
 
+from QEfficient.transformers.quantizers.quantizer_utils import dequantize_gemm
+
 
 class WQLinear_GEMM(nn.Module):
-    def __init__(self, w_bit, group_size, in_features, out_features, bias):
+    def __init__(self, bits, group_size, in_features, out_features, bias):
         super().__init__()
 
-        if w_bit != 4:
+        if bits != 4:
             raise NotImplementedError("Only 4-bit are supported for now.")
 
         self.in_features = in_features
         self.out_features = out_features
-        self.w_bit = w_bit
+        self.bits = bits
         self.group_size = group_size if group_size != -1 else in_features
 
         # quick sanity check (make sure alignment)
         if self.in_features % self.group_size != 0:
             raise ValueError(
                 f"in_features should be perfectly divisible by group_size, got in_features = {self.in_features}, group_size = {self.group_size} while initializing WQLinear_GEMM module"
             )
-        if out_features % (32 // self.w_bit) != 0:
+        if out_features % (32 // self.bits) != 0:
             raise ValueError(
                 f"out_features must be perfectly divisible by number of weights packed into int32 value i.e. 8, got out_features={self.out_features}"
             )
@@ -36,14 +38,14 @@ def __init__(self, w_bit, group_size, in_features, out_features, bias):
         self.register_buffer(
             "qweight",
             torch.zeros(
-                (in_features, out_features // (32 // self.w_bit)),
+                (in_features, out_features // (32 // self.bits)),
                 dtype=torch.int32,
             ),
         )
         self.register_buffer(
             "qzeros",
             torch.zeros(
-                (in_features // self.group_size, out_features // (32 // self.w_bit)),
+                (in_features // self.group_size, out_features // (32 // self.bits)),
                 dtype=torch.int32,
             ),
         )
@@ -70,62 +72,10 @@ def forward(self, x):
         with torch.no_grad():
             out_shape = x.shape[:-1] + (self.out_features,)
 
-            out = dequantize_gemm(self.qweight, self.qzeros, self.scales, self.w_bit, self.group_size)
+            out = dequantize_gemm(self.qweight, self.qzeros, self.scales, self.bits, self.group_size)
             out = torch.matmul(x.float(), out.float())
 
             out = out + self.bias if self.bias is not None else out
             out = out.reshape(out_shape)
 
         return out
-
-
-def unpack_and_reverse_weights_and_zeros(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int):
-    shifts = torch.arange(0, 32, bits)
-
-    # unpacking weights column-wise
-    int_weights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to(
-        torch.int8  # smallest dtype available
-    )
-    int_weights = int_weights.view(int_weights.shape[0], -1)
-
-    # unpacking zeros column-wise
-    int_zeros = torch.bitwise_right_shift(qzeros[:, :, None], shifts[None, None, :]).to(
-        torch.int8  # smallest dtype available
-    )
-    int_zeros = int_zeros.view(int_zeros.shape[0], -1)
-
-    reverse_order_tensor = torch.arange(
-        int_weights.shape[-1],
-        dtype=torch.int32,
-    )
-    reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits)
-    reverse_order_tensor = reverse_order_tensor[:, [0, 4, 1, 5, 2, 6, 3, 7]]
-    reverse_order_tensor = reverse_order_tensor.view(-1)
-
-    int_zeros = int_zeros[:, reverse_order_tensor]
-    int_weights = int_weights[:, reverse_order_tensor]
-
-    return int_weights, int_zeros
-
-
-def unpack_awq_weights(qweight, qzeros, scales, bits):
-    int_weight, int_zeros = unpack_and_reverse_weights_and_zeros(qweight, qzeros, bits)
-
-    # overflow checks
-    int_weight = torch.bitwise_and(int_weight, (2**bits) - 1)
-    int_zeros = torch.bitwise_and(int_zeros, (2**bits) - 1)
-
-    return scales, int_weight, int_zeros
-
-
-def dequantize_gemm(qweight, qzeros, scales, bits, group_size):
-    # Unpack the qweight and qzeros tensors
-    scales, int_weight, int_zeros = unpack_awq_weights(qweight, qzeros, scales, bits)
-
-    # fp16 weights
-    scales = scales.repeat_interleave(group_size, dim=0)
-    int_zeros = int_zeros.repeat_interleave(group_size, dim=0)
-
-    int_weight = (int_weight - int_zeros) * scales
-
-    return int_weight
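These three helpers are removed from awq.py; the new import at the top of the file pulls dequantize_gemm from quantizer_utils instead. For reference, the [0, 4, 1, 5, 2, 6, 3, 7] index undoes AWQ's interleaved nibble order inside each int32 — a small round-trip sketch (illustrative only):

import torch

bits = 4
awq_order = [0, 2, 4, 6, 1, 3, 5, 7]         # logical weight index stored in nibbles 0..7
values = torch.arange(8, dtype=torch.int32)  # the "true" 4-bit weights 0..7

# Pack eight 4-bit values into a single int32 column the way AWQ does.
packed = torch.zeros(1, 1, dtype=torch.int32)
for nibble, logical in enumerate(awq_order):
    packed |= values[logical] << (nibble * bits)

# Unpack with shifts 0, 4, ..., 28, as in unpack_and_reverse_weights_and_zeros.
shifts = torch.arange(0, 32, bits)
unpacked = (torch.bitwise_right_shift(packed[:, :, None], shifts[None, None, :]) & 0x0F).view(1, -1)

# Re-indexing the columns with [0, 4, 1, 5, 2, 6, 3, 7] restores sequential order.
reordered = unpacked[:, torch.tensor([0, 4, 1, 5, 2, 6, 3, 7])]
print(reordered.tolist())  # [[0, 1, 2, 3, 4, 5, 6, 7]]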
Lines changed: 73 additions & 0 deletions (new file)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import math
2+
3+
import torch
4+
from torch import nn
5+
6+
from QEfficient.transformers.quantizers.quantizer_utils import dequantize_gptq
7+
8+
9+
class QuantLinearGPTQ(nn.Module):
10+
"""
11+
A quantized linear layer using GPTQ (Generalized Post-Training Quantization).
12+
This class supports only 4-bit quantization and is compatible with QuantLinearORT.
13+
14+
Research paper link- GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers (https://arxiv.org/abs/2210.17323)
15+
16+
Attributes:
17+
in_features (int): The number of input features.
18+
out_features (int): The number of output features.
19+
bits (int): The number of bits used for quantization (must be 4).
20+
act_order (None or bool): The activation order.
21+
orig_fp_weight (None or torch.Tensor): The original floating-point weights.
22+
maxq (int): The maximum quantization value.
23+
group_size (int): The group size for quantization.
24+
pack_mode (str): The packing mode, set to "GPTQ".
25+
qweight (torch.Tensor): The quantized weight tensor.
26+
qzeros (torch.Tensor): The quantized zeros tensor.
27+
scales (torch.Tensor): The scales tensor.
28+
g_idx (torch.Tensor): The group index tensor.
29+
bias (torch.Tensor or None): The bias tensor, if applicable.
30+
"""
31+
32+
def __init__(self, bits, group_size, in_features, out_features, bias):
33+
super().__init__()
34+
if bits != 4:
35+
raise NotImplementedError("Only 4 bits are supported.")
36+
self.in_features = in_features
37+
self.out_features = out_features
38+
self.bits = bits
39+
self.act_order = None
40+
self.orig_fp_weight = None
41+
self.maxq = 2**self.bits - 1
42+
self.group_size = group_size if group_size != -1 else in_features
43+
self.pack_mode = "GPTQ"
44+
45+
# For compatibility with QuantLinearORT
46+
self.register_buffer(
47+
"qweight",
48+
torch.zeros((in_features // 32 * self.bits, out_features), dtype=torch.int32),
49+
)
50+
self.register_buffer(
51+
"qzeros",
52+
torch.zeros((math.ceil(in_features / self.group_size), out_features // 32 * self.bits), dtype=torch.int32),
53+
)
54+
self.register_buffer(
55+
"scales",
56+
torch.zeros((math.ceil(in_features / self.group_size), out_features), dtype=torch.float16),
57+
)
58+
self.g_idx = torch.tensor([i // group_size for i in range(in_features)], dtype=torch.int32)
59+
if bias:
60+
self.register_buffer(
61+
"bias",
62+
torch.zeros((out_features), dtype=torch.float16),
63+
)
64+
else:
65+
self.bias = None
66+
67+
def forward(self, x):
68+
# Only Inference supported
69+
out, _, _ = dequantize_gptq(self.qweight.T, self.qzeros, self.scales, self.bits, self.g_idx)
70+
out = torch.matmul(x.float(), out.float())
71+
out = out + self.bias if self.bias is not None else out
72+
73+
return out
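The registered buffers encode GPTQ's packing: qweight packs 32 // bits = 8 quantized weights per int32 along the in_features axis, qzeros packs 8 zero-points per int32 along the out_features axis, and there is one fp16 scale row per group. A quick shape check under assumed toy dimensions (illustrative only):

import math

bits, group_size = 4, 128
in_features, out_features = 4096, 11008  # assumed LLaMA-style projection sizes

qweight_shape = (in_features // 32 * bits, out_features)                         # 8 weights per int32 row
qzeros_shape = (math.ceil(in_features / group_size), out_features // 32 * bits)  # 8 zero-points per int32 column
scales_shape = (math.ceil(in_features / group_size), out_features)               # one scale per group and column
g_idx_len = in_features                                                          # group index for every input feature

print(qweight_shape)  # (512, 11008)
print(qzeros_shape)   # (32, 1376)
print(scales_shape)   # (32, 11008)
print(g_idx_len)      # 4096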
