### 2.2.2 对称动态量化代码实现


In [1]:
import torch
import torch.nn as nn

# 创建简单模型
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(3, 2, bias=False)
        # 设置固定权重便于观察
        self.linear.weight.data = torch.tensor(
            [[0.1234, 0.5678, 0.9012],
             [-0.2468, -0.1357, 0.3579]],
            dtype=torch.float32
        )
    
    def forward(self, x):
        return self.linear(x)

# 创建模型实例
model = SimpleModel()

# 获取第一个权重的原始参数
original_param = model.linear.weight.data
print(f"原参数: {original_param}")

# 动态量化
# torch.ao.quantization.quantize_dynamic 参数详解:
# 1. model: 要进行量化的模型
# 2. {nn.Linear}: 指定要量化的模块类型集合(这里只量化Linear层)
# 3. dtype: 量化数据类型(torch.qint8表示8位有符号整数)
# 返回值: 量化后的模型(只有指定模块被量化，其他保持不变)
model_int8 = torch.ao.quantization.quantize_dynamic(
    model,
    {nn.Linear},
    dtype=torch.qint8 # 8位有符号整数（默认）
)

# 获取量化参数
# 重要: 量化后，model_int8.linear.weight 不再是一个属性，而是一个函数!
# 调用 weight() 返回 PackedParams 对象
quantized_weight_all = model_int8.linear.weight()

# 正确获取参数的方法:
# 1. 获取反量化后的浮点参数(用于计算)
dequantized_param = quantized_weight_all.dequantize()

# 2. 获取int8表示(实际存储的量化值)
int8_param = quantized_weight_all.int_repr()

# 3. 获取量化参数
# q_scale: 量化缩放因子，用于将浮点数映射到整数范围
# q_zero_point: 零点偏移，表示浮点0对应的整数值
print(f"量化后(int8): {int8_param}")
print(f"反量化后: {dequantized_param}")
print(f"量化参数 - scale: {quantized_weight_all.q_scale():.6f}, zero_point: {quantized_weight_all.q_zero_point()}")

# 测试推理
test_input = torch.randn(1, 3)
output_fp32 = model(test_input)
output_int8 = model_int8(test_input)
print(f"\n浮点模型输出: {output_fp32}")
print(f"量化模型输出: {output_int8}")

原参数: tensor([[ 0.1234,  0.5678,  0.9012],
        [-0.2468, -0.1357,  0.3579]])
量化后(int8): tensor([[ 17,  80, 127],
        [-35, -19,  51]], dtype=torch.int8)
反量化后: tensor([[ 0.1202,  0.5655,  0.8977],
        [-0.2474, -0.1343,  0.3605]])
量化参数 - scale: 0.007068, zero_point: 0

浮点模型输出: tensor([[0.3380, 0.7414]], grad_fn=<MmBackward0>)
量化模型输出: tensor([[0.3537, 0.7434]])


### 3.2.2 对称静态量化代码实现

In [16]:
import torch
import torch.nn as nn
from torch.ao import quantization

# 1. 定义待量化模型（含量化/反量化节点）
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = quantization.QuantStub()  # 输入FP32→INT8
        self.dequant = quantization.DeQuantStub()  # 输出INT8→FP32
        
        self.linear1 = nn.Linear(3, 4, bias=False)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(4, 2, bias=False)
        
        # 固定权重，便于观察量化效果
        self.linear1.weight.data = torch.tensor(
            [[0.1234, 0.5678, 0.9012],
             [-0.2468, -0.1357, 0.3579],
             [0.4680, 0.7890, -0.1011],
             [-0.3234, 0.6543, -0.9876]],
            dtype=torch.float32
        )
        self.linear2.weight.data = torch.tensor(
            [[0.1122, -0.3344, 0.5566, -0.7788],
             [0.2233, -0.4455, 0.6677, -0.8899]],
            dtype=torch.float32
        )
    
    def forward(self, x):
        x = self.quant(x)
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.dequant(x)
        return x

# 2. 初始化浮点模型，获取基准输出
model_fp32 = SimpleModel()
model_fp32.eval()  # 量化仅支持eval模式
torch.manual_seed(42)  # 固定种子，结果可复现
test_input = torch.randn(1, 3)
output_fp32 = model_fp32(test_input)
print(f"浮点模型输出:\n{output_fp32}\n")

# 3. 量化核心步骤（权重按通道，激活按张量）
# 3.1 配置量化规则：fbgemm默认权重按通道量化，激活按张量量化
qconfig = quantization.get_default_qconfig("fbgemm")
model_fp32.qconfig = qconfig

# 3.2 准备校准：插入假量化节点，开启统计
model_prepared = quantization.prepare(model_fp32)

# 3.3 激活校准：统计激活分布（模拟10个样本）
calibration_data = [torch.randn(1, 3) for _ in range(10)]
with torch.no_grad():
    for data in calibration_data:
        model_prepared(data)

# 3.4 转换为INT8模型：权重按通道量化，激活按张量量化
model_int8 = quantization.convert(model_prepared)
model_int8.eval()

# 4. 量化参数分析
print("="*60)
# linear1权重（按通道量化：每个输出通道独立scale/zero_point）
quantized_linear1 = model_int8.linear1
quantized_weight1 = quantized_linear1.weight()

print(f"linear1 原始FP32权重:\n{model_fp32.linear1.weight.data}")
print(f"\nlinear1 按通道量化INT8权重:\n{quantized_weight1.int_repr()}")
print(f"\nlinear1 反量化FP32权重:\n{quantized_weight1.dequantize()}")

# 按通道量化参数
scales = quantized_weight1.q_per_channel_scales()
zero_points = quantized_weight1.q_per_channel_zero_points()
axis = quantized_weight1.q_per_channel_axis()
print(f"\nlinear1 按通道量化参数（{len(scales)}个通道）:")
for i in range(len(scales)):
    print(f"  通道{i} - scale: {scales[i].item():.6f}, zero_point: {zero_points[i].item()}")
print(f"  通道维度: axis={axis}（0=输出特征维度）")

# 激活量化参数（按张量量化：全激活张量共用1套scale/zero_point，无多通道独立分布特征，效率更高且精度足够）
quant_stub = model_int8.quant
act_scale = quant_stub.scale.item()
act_zero_point = quant_stub.zero_point.item()
print(f"\n激活量化参数（按张量） - scale: {act_scale:.6f}, zero_point: {act_zero_point}")
print("  说明：激活无多通道独立分布特征，按张量量化（全局单参数）效率更高，精度满足需求（区别于权重的按通道量化）")
print("="*60)

# 5. 推理与误差对比
output_int8 = model_int8(test_input)
print(f"\n按通道量化模型输出:\n{output_int8}")
quant_error = torch.norm(output_fp32 - output_int8).item()
print(f"\n量化误差（L2范数）: {quant_error:.6f}")

浮点模型输出:
tensor([[0.1677, 0.2300]], grad_fn=<MmBackward0>)

linear1 原始FP32权重:
tensor([[ 0.1234,  0.5678,  0.9012],
        [-0.2468, -0.1357,  0.3579],
        [ 0.4680,  0.7890, -0.1011],
        [-0.3234,  0.6543, -0.9876]])

linear1 按通道量化INT8权重:
tensor([[  17,   80,  127],
        [ -88,  -48,  127],
        [  76,  127,  -16],
        [ -42,   84, -128]], dtype=torch.int8)

linear1 反量化FP32权重:
tensor([[ 0.1202,  0.5655,  0.8977],
        [-0.2470, -0.1347,  0.3565],
        [ 0.4703,  0.7859, -0.0990],
        [-0.3253,  0.6507, -0.9915]])

linear1 按通道量化参数（4个通道）:
  通道0 - scale: 0.007068, zero_point: 0
  通道1 - scale: 0.002807, zero_point: 0
  通道2 - scale: 0.006188, zero_point: 0
  通道3 - scale: 0.007746, zero_point: 0
  通道维度: axis=0（0=输出特征维度）

激活量化参数（按张量） - scale: 0.030678, zero_point: 55
  说明：激活无多通道独立分布特征，按张量量化（全局单参数）效率更高，精度满足需求（区别于权重的按通道量化）

按通道量化模型输出:
tensor([[0.1640, 0.2343]])

量化误差（L2范数）: 0.005609


### 量化感知训练

In [17]:
import torch
import torch.nn as nn
from torch.ao import quantization

# 1. 定义待量化模型（含量化/反量化节点）
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = quantization.QuantStub()  # 输入FP32→INT8
        self.dequant = quantization.DeQuantStub()  # 输出INT8→FP32
        
        self.linear1 = nn.Linear(3, 4, bias=False)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(4, 2, bias=False)
        
        # 固定初始权重，便于观察QAT效果
        self.linear1.weight.data = torch.tensor(
            [[0.1234, 0.5678, 0.9012],
             [-0.2468, -0.1357, 0.3579],
             [0.4680, 0.7890, -0.1011],
             [-0.3234, 0.6543, -0.9876]],
            dtype=torch.float32
        )
        self.linear2.weight.data = torch.tensor(
            [[0.1122, -0.3344, 0.5566, -0.7788],
             [0.2233, -0.4455, 0.6677, -0.8899]],
            dtype=torch.float32
        )
    
    def forward(self, x):
        x = self.quant(x)
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.dequant(x)
        return x

# 2. 初始化模型 + 配置QAT量化规则
model = SimpleModel()
# QAT专用qconfig：训练时模拟量化误差，fbgemm适配x86，权重按通道、激活按张量
qconfig = quantization.get_default_qat_qconfig("fbgemm")
model.qconfig = qconfig

# 3. 准备量化感知训练（QAT）模型（核心：插入量化模拟节点，训练时感知量化误差）
model_prepared = quantization.prepare_qat(model)
model_prepared.train()  # QAT需要在训练模式下执行

# 4. 简单的训练循环（新手演示用，仅3轮训练）
optimizer = torch.optim.SGD(model_prepared.parameters(), lr=0.01)  # 优化器
loss_fn = nn.MSELoss()  # 损失函数（回归任务）
torch.manual_seed(42)   # 固定种子，结果可复现

# 模拟训练数据：10个样本，输入3维，标签2维（和模型输出匹配）
train_data = [torch.randn(1, 3) for _ in range(10)]
train_labels = [torch.randn(1, 2) for _ in range(10)]

# 训练循环（QAT核心：训练过程中让模型适应量化误差）
print("开始量化感知训练（QAT）...")
for epoch in range(3):
    total_loss = 0.0
    for x, y in zip(train_data, train_labels):
        optimizer.zero_grad()  # 清空梯度
        output = model_prepared(x)  # 前向传播（含量化模拟）
        loss = loss_fn(output, y)   # 计算损失
        loss.backward()             # 反向传播
        optimizer.step()            # 更新权重
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, 平均损失: {total_loss/len(train_data):.6f}")

# 5. 训练完成后，转换为真正的INT8量化模型
model_prepared.eval()  # 转换前切回推理模式
model_int8 = quantization.convert(model_prepared)

# 6. 测试量化模型效果
test_input = torch.randn(1, 3)
# 原始浮点模型（训练前）输出对比
model.eval()
output_fp32 = model(test_input)
# QAT量化模型输出
output_int8 = model_int8(test_input)

# 7. 量化参数分析
print("\n" + "="*60)
# linear1权重（按通道量化）
quantized_linear1 = model_int8.linear1
quantized_weight1 = quantized_linear1.weight()

print(f"linear1 按通道量化INT8权重:\n{quantized_weight1.int_repr()}")
# 按通道量化参数
scales = quantized_weight1.q_per_channel_scales()
zero_points = quantized_weight1.q_per_channel_zero_points()
print(f"\nlinear1 按通道量化参数（{len(scales)}个通道）:")
for i in range(len(scales)):
    print(f"  通道{i} - scale: {scales[i].item():.6f}, zero_point: {zero_points[i].item()}")

# 激活量化参数（按张量：全激活张量共用1套scale/zero_point，无多通道独立分布特征，效率更高且精度足够）
quant_stub = model_int8.quant
act_scale = quant_stub.scale.item()
act_zero_point = quant_stub.zero_point.item()
print(f"\n激活量化参数（按张量） - scale: {act_scale:.6f}, zero_point: {act_zero_point}")
print("="*60)

# 8. 结果对比
print(f"\n训练前浮点模型输出:\n{output_fp32}")
print(f"QAT量化模型输出:\n{output_int8}")
# 计算量化误差
quant_error = torch.norm(output_fp32 - output_int8).item()
print(f"QAT量化误差（L2范数）: {quant_error:.6f}")

开始量化感知训练（QAT）...
Epoch 1, 平均损失: 0.537143
Epoch 2, 平均损失: 0.536015
Epoch 3, 平均损失: 0.536302

linear1 按通道量化INT8权重:
tensor([[  18,   80,  127],
        [ -86,  -48,  127],
        [  76,  127,  -18],
        [ -42,   84, -128]], dtype=torch.int8)

linear1 按通道量化参数（4个通道）:
  通道0 - scale: 0.007069, zero_point: 0
  通道1 - scale: 0.002813, zero_point: 0
  通道2 - scale: 0.006188, zero_point: 0
  通道3 - scale: 0.007746, zero_point: 0

激活量化参数（按张量） - scale: 0.003848, zero_point: 0

训练前浮点模型输出:
tensor([[-0.1679, -0.1966]], grad_fn=<MmBackward0>)
QAT量化模型输出:
tensor([[0.0000, 0.0279]])
QAT量化误差（L2范数）: 0.280334


### LLM.int8()

In [2]:
# 1. 导入核心库 + 环境检查（适配你的conda+cu126+RTX4070）
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from bitsandbytes.optim import BitsAndBytesConfig

# 环境检查（修正CUDA版本查询，适配你的配置）
print("===== 环境检查 =====")
print(f"Torch版本：{torch.__version__}")
print(f"CUDA是否可用：{torch.cuda.is_available()}")
print(f"PyTorch编译的CUDA版本：{torch.version.cuda}")  # 修正你的错误写法
print(f"可用GPU数量：{torch.cuda.device_count()}")
print(f"GPU型号：{torch.cuda.get_device_name(0)}")
print(f"Transformers版本：{__import__('transformers').__version__}")  # 验证已装的4.52.1

# 强制指定RTX4070 GPU
device = "cuda:0"
if not torch.cuda.is_available():
    raise ValueError("❌ 检测到GPU但CUDA不可用，请确认显卡驱动/conda环境激活")

# 2. 配置LLM.int8量化参数（兼容transformers 4.52.1）
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # 开启LLM.int8量化核心
    load_in_8bit_device_map={"": 0},  # 强制加载到RTX4070
    bnb_4bit_compute_dtype=torch.float16,  # RTX4070加速
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    llm_int8_threshold=6.0  # LLM.int8核心阈值（适配移动端GPU）
)

# 3. 轻量级模型（RTX4070无压力）
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 4. 加载8bit量化模型（适配transformers 4.52.1）
print("\n===== 加载模型 =====")
print("加载8bit模型到RTX4070...（首次下载≈1.1GB，耐心等待）")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map=device,  # 强制GPU加载
    trust_remote_code=True,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True  # 减少conda环境的CPU内存占用
)

# 验证模型状态
print(f"\n===== 模型验证 =====")
print(f"模型设备：{next(model.parameters()).device}")  # 应输出cuda:0
print(f"是否8bit量化：{model.is_loaded_in_8bit}")  # 应输出True
print(f"GPU显存占用：{torch.cuda.memory_allocated(0)/1024/1024:.0f} MB")

# 5. 加载分词器（适配transformers 4.52.1）
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # 修复padding警告

# 6. 文本生成测试（RTX4070加速）
print("\n===== 生成测试 =====")
prompt = "请用一句话介绍人工智能"
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding=True,
    truncation=True
).to(device)

# 生成参数（兼容transformers 4.52.1）
outputs = model.generate(
    **inputs,
    max_new_tokens=80,
    temperature=0.6,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

# 输出结果
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"输入：{prompt}")
print(f"输出：{response}")

# 清理显存（conda环境可选）
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm
Could not find the bitsandbytes CUDA binary at WindowsPath('D:/Anaconda/envs/deeplearning/lib/site-packages/bitsandbytes/libbitsandbytes_cuda126.dll')
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


ImportError: cannot import name 'BitsAndBytesConfig' from 'bitsandbytes.optim' (D:\Anaconda\envs\deeplearning\lib\site-packages\bitsandbytes\optim\__init__.py)