In [1]:
from models import *
from utils import *  
import hook 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Module, Parameter
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


## Test QuantAct


In [None]:

x = torch.randn(1, 768).to(device)
quant_act = QuantAct(activation_bit=8, quant_mode='symmetric').to(device)
output, scale = quant_act(x)

In [14]:
print("Output:", output)
print("Scale:", scale)  

Output: tensor([[ 0.0552, -0.3588, -0.9661,  2.8983,  1.2697, -1.1869, -0.3588, -0.6901,
         -0.4140,  1.5457, -0.5244, -0.9937,  0.0276, -0.5520, -1.2421, -0.8005,
         -0.9661,  0.9385,  0.8281,  0.4140,  0.8557,  0.9109, -0.6349,  0.1104,
         -1.3801,  0.4968,  1.2697, -0.6901, -1.1869,  0.4140, -0.5797, -0.2208,
          0.6349, -0.0276, -0.2760,  0.5520,  2.3186, -0.0276, -0.9385, -0.4968,
          1.4629, -0.3312, -2.2082, -0.2484, -0.1656,  0.4968,  0.6625,  0.1932,
         -0.5520, -0.6349, -1.1041, -0.3312,  0.6901, -0.5520,  0.2484,  0.1656,
          0.0276, -0.5244,  1.7114, -0.2760,  1.1317, -0.7177, -1.9322,  0.7453,
         -1.9322, -0.5244,  1.6009,  0.3864,  0.0552, -0.2484, -1.1041, -0.0000,
         -1.2145,  1.0213, -1.0765, -0.4416,  0.7453, -1.5457,  1.6285,  1.6561,
         -0.1656,  1.0213,  0.8005,  0.3864,  1.1869, -0.0276, -0.2208, -0.1932,
          0.3864,  0.0000,  1.1593,  0.1656, -1.6285, -1.0213, -1.0489, -0.6073,
          0.2484, -0

In [17]:
recovered_int = (output / scale).round()
print("Recovered INT:", recovered_int)


Recovered INT: tensor([[   2.,  -13.,  -35.,  105.,   46.,  -43.,  -13.,  -25.,  -15.,   56.,
          -19.,  -36.,    1.,  -20.,  -45.,  -29.,  -35.,   34.,   30.,   15.,
           31.,   33.,  -23.,    4.,  -50.,   18.,   46.,  -25.,  -43.,   15.,
          -21.,   -8.,   23.,   -1.,  -10.,   20.,   84.,   -1.,  -34.,  -18.,
           53.,  -12.,  -80.,   -9.,   -6.,   18.,   24.,    7.,  -20.,  -23.,
          -40.,  -12.,   25.,  -20.,    9.,    6.,    1.,  -19.,   62.,  -10.,
           41.,  -26.,  -70.,   27.,  -70.,  -19.,   58.,   14.,    2.,   -9.,
          -40.,   -0.,  -44.,   37.,  -39.,  -16.,   27.,  -56.,   59.,   60.,
           -6.,   37.,   29.,   14.,   43.,   -1.,   -8.,   -7.,   14.,    0.,
           42.,    6.,  -59.,  -37.,  -38.,  -22.,    9.,  -36.,  -80.,    9.,
          -53.,   31.,   65.,  -23.,   58.,  -23.,    2.,   54.,   12.,  -30.,
           -3.,   15.,   -5.,    8.,  -47.,  -23.,    7.,   20.,    7.,  -16.,
          -12.,    9.,   71.,   -0., 

In [None]:
int_repr()

In [16]:
output2 = recovered_int * scale
print(torch.allclose(output, output2, atol=1e-5))  # True 表示量化 + 還原沒誤

True


## IntGelu

In [20]:

class IntGELU(Module):
    """
    Class to quantize given GELU layer

    Parameters:
    ----------
    quant_mode : 'none' or 'symmetric', default 'none'
        The mode for quantization. 'none' for no quantization.
    force_dequant : str, default 'none'
        Force dequantize GELU if either 'gelu' or 'nonlinear' is given.
    """
    def __init__(self,
                 quant_mode='none',
                 force_dequant='none'):
        super(IntGELU, self).__init__()
        self.register_buffer('input_scaling_factor', torch.ones(1))
        self.quant_mode = quant_mode
        if force_dequant in ['nonlinear', 'gelu']:
            logger.info("Force dequantize gelu")
            self.quant_mode = 'none'


        if self.quant_mode == 'none':
            self.activation_fn = nn.GELU()
        elif self.quant_mode == 'symmetric':
            pass
        elif quant_mode == "asymmetric":
            raise NotImplementedError("unsupported quant mode: {}".format(self.quant_mode))
        else:
            raise ValueError("unknown quant mode: {}".format(quant_mode))

        self.k = 1.4142
        self.n = 14 # sufficiently large integer
        self.coeff = [-0.2888, -1.769, 1] # a(x+b)**2 + c
        self.coeff[2] /= self.coeff[0]

    def fix(self):
        pass

    def unfix(self):
        pass

    def int_erf(self, x_int, scaling_factor):
        with torch.no_grad():
            b_int = torch.floor(self.coeff[1] / scaling_factor)
            c_int = torch.floor(self.coeff[2] / scaling_factor ** 2)

        with torch.no_grad():
            sign = torch.sign(x_int)
        abs_int = torch.abs(x_int)
        abs_int = torch.min(abs_int, -b_int)
        y_int = (abs_int + b_int) ** 2 + c_int
        y_int = sign * y_int
        scaling_factor = scaling_factor ** 2 * self.coeff[0]
        y_int = floor_ste.apply(y_int / 2 ** self.n)
        scaling_factor = scaling_factor * 2 ** self.n
        
        return y_int, scaling_factor

    def forward(self, x, scaling_factor=None):
        if self.quant_mode == 'none':
            return self.activation_fn(x), None

        assert self.quant_mode == 'symmetric', \
                "unsupported quant mode: {}".format(quant_mode)

        x_int = x / scaling_factor
        sigmoid_int, sigmoid_scaling_factor = self.int_erf(x_int, scaling_factor / self.k)

        shift_int = torch.floor(1. / sigmoid_scaling_factor)

        x_int = x_int * (sigmoid_int + shift_int)
        scaling_factor = scaling_factor * sigmoid_scaling_factor / 2

        return x_int * scaling_factor, scaling_factor

In [21]:
from torch.autograd import Function
# floor STE（讓整數化時的非可導操作能反向傳播）
class floor_ste(Function):
    @staticmethod
    def forward(ctx, x):
        return torch.floor(x)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output

# IntGELU 請假設你已經事先定義好

def test_intgelu():
    # 1. 選擇 device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 2. 建立浮點輸入資料
    x_fp = torch.randn(1, 128).to(device)
    act_bit = 8
    qmax = 2 ** (act_bit - 1) - 1

    # 3. 手動計算 scaling factor 並量化
    max_val = x_fp.abs().max()
    scale = (max_val / qmax).clamp(min=1e-8)
    x_int = torch.clamp((x_fp / scale).round(), -qmax, qmax).to(torch.int32)

    # 4. 建立 IntGELU 模型並移到 device
    gelu = IntGELU(quant_mode='symmetric').to(device)

    # 5. 執行 IntGELU 推論（注意要轉成 float 傳入）
    out_intgelu, out_scale = gelu(x_int.float(), scale)

    # 6. 參考真實 GELU 結果
    gelu_ref = nn.GELU().to(device)
    out_fp = gelu_ref(x_fp)

    # 7. 將 intgelu 輸出還原為 float 做比較
    out_approx = out_intgelu / out_scale

    # 8. 印出測試結果
    print("=== IntGELU 測試 ===")
    print("原始輸入 (float):", x_fp[0, :5])
    print("浮點 GELU:", out_fp[0, :5])
    print("整數近似 GELU:", out_approx[0, :5])
    print("MSE:", torch.mean((out_fp - out_approx) ** 2).item())

test_intgelu()

=== IntGELU 測試 ===
原始輸入 (float): tensor([-0.6101,  2.5926, -0.7107, -0.8551, -0.3036], device='cuda:0')
浮點 GELU: tensor([-0.1653,  2.5803, -0.1696, -0.1678, -0.1156], device='cuda:0')
整數近似 GELU: tensor([  1469.5619, -24884.5801,   1714.4888,   2057.3865,    734.7809],
       device='cuda:0')
MSE: 57368348.0


## tryQLinear

In [7]:
w = torch.randn(3, 4).to(device)
print(w.shape[0])
print(w.shape[1])
print(w)

3
4
tensor([[ 0.0839, -0.8119,  0.5391,  1.3295],
        [-1.2138, -0.2571,  0.4485,  2.0543],
        [-0.2641,  0.2417, -0.1684, -0.4287]], device='cuda:0')


In [5]:
qact = QuantAct()
x = torch.randn(2, 3).to(device)
quant_linear_test = QuantLinear(3, 4, weight_bit=8, quant_mode='symmetric').to(device)

In [6]:
print(x)

tensor([[ 1.4880, -1.3811,  1.4525],
        [ 0.9484,  1.3777, -1.1465]], device='cuda:0')


In [9]:
x1, scale = qact(x)
print("Quantized Output:", x1)
print("Scale:", scale)

Quantized Output: tensor([[ 1.4880, -1.3826,  1.4529],
        [ 0.9490,  1.3826, -1.1482]], device='cuda:0')
Scale: tensor(0.0117, device='cuda:0')


In [11]:
quanted_act = x1 / scale
print("Quantized Activation:", quanted_act) 

Quantized Activation: tensor([[ 127., -118.,  124.],
        [  81.,  118.,  -98.]], device='cuda:0')


In [14]:
w1, w_scale = quant_linear_test(x1, scale)
print(w_scale)
print("Weights:", w1)
quanted_w = w1 / w_scale
print("Quantized Weights:", quanted_w)


tensor([4.6942e-05, 4.6940e-05, 3.4805e-05, 3.8446e-05], device='cuda:0')
Weights: tensor([[-1.3432, -0.1147, -0.7765,  1.1408],
        [-0.0747, -1.9044,  0.2433,  0.9682]], device='cuda:0',
       grad_fn=<MulBackward0>)
Quantized Weights: tensor([[-28613.,  -2444., -22309.,  29672.],
        [ -1591., -40570.,   6989.,  25184.]], device='cuda:0',
       grad_fn=<DivBackward0>)


In [3]:
reshp = torch.randn(2, 3, 4).to(device)
reshp = reshp.reshape(-1, reshp.shape[-1])
print(reshp.shape)


torch.Size([6, 4])


In [3]:
from models.vit_quant import *

In [4]:
model = deit_tiny_patch16_224(pretrained=False)

In [5]:
print(model)

AttributeError: 'QuantAct' object has no attribute 'x_min'

In [None]:
from models.vit_quant import Attention


In [3]:
import torch


checkpoint = torch.load("checkpoint.pth.tar", map_location='cpu')

In [None]:
import torch
import torch.nn.functional as F

# 假设 model 是你的完整 ViT 模型，
# 并且你已经准备好一个输入张量 input_tensor: shape [B, N, C]
# （比如 embedding 之后加上 cls_token、pos_embed，再 dropout 之后的结果）。
# 这里我们直接用随机数模拟：
B, N, C = 1, 198, 192
input_tensor = torch.randn(B, N, C)

# 1. 直接用隨機的 x_norm，不再通過 model.blocks[0].norm1
x_norm = torch.randn(B, N, C)  # 隨機生成代替原本的 norm1 輸出

# 2. 對 x_norm 做 symmetric quantization 計算 scaling factor
# Symmetric quantization: α = max(abs(x)) / (2^(bit-1) - 1)
# 假設用 8-bit quantization

n_bits = 8
max_val = 2**(n_bits-1) - 1  # 127 for 8-bit
α_in = torch.max(torch.abs(x_norm), dim=-1, keepdim=True)[0] / max_val  # shape [B, N, 1]
# 或者如果你想要 per-channel scaling (shape=[C])：
# α_in = torch.max(torch.abs(x_norm.view(-1, C)), dim=0)[0] / max_val  # shape [C]

# 使用 per-channel scaling factor
α_in = torch.max(torch.abs(x_norm.view(-1, C)), dim=0)[0] / max_val  # shape [C]

# 3. 量化浮點 x_norm 到整數域
x_int = (x_norm / α_in).round()  # shape [B, N, C]

# 4. 現在就有了量化後的輸入 activation x_int，可以照之前的流程
# 拿到 proj 層的整數權重和偏置，然後在整數域裡做線性：
Wq_int = checkpoint["blocks.0.attn.qkv.weight_integer"][:C, :]  # Q 部分
bq_int = checkpoint["blocks.0.attn.qkv.bias_integer"][:C]
q_int = F.linear(x_int, Wq_int, bq_int)  # [B, N, C]

# 5. 恢復到浮點域
scale_w_q = checkpoint["blocks.0.attn.qkv.fc_scaling_factor"][:C]  # [C]
α_out_q = scale_w_q * α_in
q_fp = q_int * α_out_q

# 保存各個階段的整數結果到 txt 文件
import numpy as np

# 1. 保存 q_proj 的 input activation (int8)
x_int_np = x_int.detach().cpu().numpy().astype(np.int8)
np.savetxt('q_proj_input_activation_int8.txt', x_int_np.reshape(-1, C), fmt='%d', delimiter=',')
print(f"Saved q_proj input activation (int8) shape: {x_int_np.shape}")

# 2. 保存 q_proj 的 weight (int)
Wq_int_np = Wq_int.detach().cpu().numpy().astype(np.int32)
np.savetxt('q_proj_weight_int.txt', Wq_int_np, fmt='%d', delimiter=',')
print(f"Saved q_proj weight (int) shape: {Wq_int_np.shape}")

# 3. 保存 q_proj 的 bias (int32)
bq_int_np = bq_int.detach().cpu().numpy().astype(np.int32)
np.savetxt('q_proj_bias_int32.txt', bq_int_np.reshape(1, -1), fmt='%d', delimiter=',')
print(f"Saved q_proj bias (int32) shape: {bq_int_np.shape}")

# 4. 保存 q_proj 的 output (int32) - 這是線性層的直接輸出
q_int_np = q_int.detach().cpu().numpy().astype(np.int32)
np.savetxt('q_proj_output_int32.txt', q_int_np.reshape(-1, C), fmt='%d', delimiter=',')
print(f"Saved q_proj output (int32) shape: {q_int_np.shape}")

# 5. 保存 scaling factors 供參考
scaling_factors = {
    'alpha_in': α_in.detach().cpu().numpy(),
    'scale_w_q': scale_w_q.detach().cpu().numpy(),
    'alpha_out_q': α_out_q.detach().cpu().numpy()
}
np.savetxt('scaling_factors.txt', np.column_stack([scaling_factors['alpha_in'], 
                                                  scaling_factors['scale_w_q'], 
                                                  scaling_factors['alpha_out_q']]), 
           fmt='%.8f', delimiter=',', 
           header='alpha_in,scale_w_q,alpha_out_q')

print("x_norm:", x_norm[0,0,:5])
print("α_in (scaling factor):", α_in[:5])
print("x_int:", x_int[0,0,:5])
print("q_int:", q_int[0,0,:5])
print("q_fp:", q_fp[0,0,:5])
print("\nAll integer tensors saved to txt files!")