In [1]:
%%writefile requirements.txt
onnx
onnxruntime
tensorrt
vllm
lmdeploy
openvino
nncf

Writing requirements.txt


In [None]:
%%capture
!pip install -r requirements.txt

In [None]:
# prompt: quantization code using onnx

import onnx
from onnxruntime.quantization import QuantType, quantize_dynamic

# The `onnx` package only defines the model format and ships no quantization
# API; the quantization itself is done by ONNX Runtime's tooling
# (requires the `onnxruntime` package).
fp32_model_path = "path/to/your/model.onnx"
int8_model_path = "path/to/save/quantized_model.onnx"

# Load your ONNX model
model = onnx.load(fp32_model_path)

# Quantize the model. Dynamic quantization stores the weights as INT8 and
# derives activation scales at run time, so no calibration data is needed.
# The quantized model is written directly to `int8_model_path`.
quantize_dynamic(
    model_input=fp32_model_path,
    model_output=int8_model_path,
    weight_type=QuantType.QInt8,
)

# Load the saved quantized model so it can be inspected in later cells
quantized_model = onnx.load(int8_model_path)


In [None]:
# prompt: quantization code using tensorrt

import tensorrt as trt
import torch

# Load your PyTorch model.
# NOTE(security): the checkpoint is a fully pickled nn.Module, which needs
# `weights_only=False`; unpickling can execute arbitrary code, so only load
# files from a trusted source.
model = torch.load("path/to/your/model.pth", weights_only=False)
model.eval()  # export inference behaviour (dropout off, frozen batch-norm stats)

# Convert the model to ONNX
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(model, dummy_input, "path/to/save/model.onnx", verbose=True)


class MyCalibrator(trt.IInt8EntropyCalibrator2):
    """Feed calibration batches to TensorRT for INT8 range estimation.

    Args:
        batches: Sequence of tensors shaped like the network input. When
            omitted, random placeholder data matching the exported dummy
            input is used -- replace it with representative real samples,
            otherwise the INT8 ranges (and accuracy) will be meaningless.

    Raises:
        ValueError: If `batches` is an empty sequence.
    """

    def __init__(self, batches=None):
        # TensorRT requires the base class to be initialised explicitly.
        trt.IInt8EntropyCalibrator2.__init__(self)
        if batches is None:
            batches = [torch.randn(1, 3, 224, 224) for _ in range(8)]
        self._batches = list(batches)
        if not self._batches:
            raise ValueError("MyCalibrator needs at least one calibration batch")
        self._cursor = 0
        self._device_batch = None  # keeps the current GPU buffer alive

    def get_batch_size(self):
        return int(self._batches[0].shape[0])

    def get_batch(self, names):
        # Returning None tells TensorRT that the calibration data is exhausted.
        if self._cursor >= len(self._batches):
            return None
        host_batch = self._batches[self._cursor]
        self._cursor += 1
        self._device_batch = host_batch.to("cuda", dtype=torch.float32).contiguous()
        # One device pointer per network input (this model has a single input).
        return [int(self._device_batch.data_ptr())]

    def read_calibration_cache(self):
        # No cache is used, so calibration always runs on the supplied batches.
        return None

    def write_calibration_cache(self, cache):
        # Intentionally not persisted; see read_calibration_cache.
        pass


# Load the ONNX model
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(EXPLICIT_BATCH)
parser = trt.OnnxParser(network, TRT_LOGGER)

# A separate name is used for the file handle so `model` keeps referring to
# the PyTorch module.
with open("path/to/save/model.onnx", "rb") as onnx_file:
    parsed_ok = parser.parse(onnx_file.read())
if not parsed_ok:
    parse_errors = [str(parser.get_error(i)) for i in range(parser.num_errors)]
    raise RuntimeError("Failed to parse the ONNX model:\n" + "\n".join(parse_errors))

# Quantize the model: INT8 is enabled on the builder config (the old
# `builder.int8_mode` / `builder.int8_calibrator` attributes no longer exist).
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.INT8)
config.int8_calibrator = MyCalibrator()  # Replace with your own calibration data

# Build the TensorRT engine (returned already serialized; None on failure)
serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    raise RuntimeError("TensorRT failed to build the INT8 engine; see the log above")

# Serialize the engine to a file
with open("path/to/save/engine.trt", "wb") as f:
    f.write(serialized_engine)

# Deserialize so that `engine` is available for inference in later cells
runtime = trt.Runtime(TRT_LOGGER)
engine = runtime.deserialize_cuda_engine(serialized_engine)


In [None]:
# prompt: quantization code using vllm

from vllm import LLM

# vLLM has no `vllm.quantization.quantize_model` helper. The quantization
# scheme is selected through the `quantization` argument of `LLM`.
# For 4-bit weights the checkpoint has to be quantized ahead of time with an
# AWQ tool and saved to `quantized_model_dir`.
quantized_model_dir = "path/to/save/quantized_model"

# Load the quantized model
quantized_llm = LLM(model=quantized_model_dir, quantization="awq")


In [None]:
# prompt: lmdeploy 로 하는 qunatization 코드

# Quantize the model using lmdeploy
!lmdeploy quantize \
  --model_path path/to/your/model \
  --output_path path/to/save/quantized_model \
  --quantization_config '{"bits": 4, "group_size": -1, "dtype": "int4"}'


In [None]:
# prompt: quantization code using openvino
import nncf
import numpy as np
import openvino as ov

# Load the ONNX model
core = ov.Core()
model = core.read_model("path/to/save/model.onnx")

# Calibration data for post-training quantization.
# Placeholder: random tensors with the shape used when the ONNX model was
# exported (1, 3, 224, 224). Replace with representative real inputs,
# otherwise the estimated activation ranges will not reflect real usage.
rng = np.random.default_rng(0)
calibration_samples = [
    rng.standard_normal((1, 3, 224, 224), dtype=np.float32) for _ in range(32)
]
calibration_dataset = nncf.Dataset(calibration_samples)

# Quantize the model. OpenVINO itself has no `ov.quantize`; INT8 post-training
# quantization is provided by NNCF (requires the `nncf` package).
quantized_model = nncf.quantize(
    model,
    calibration_dataset,
    subset_size=len(calibration_samples),
)

# Save the quantized model (writes the .xml and the matching .bin file)
ov.save_model(quantized_model, "path/to/save/quantized_model.xml")
