In [None]:
#@title Requirements
%%writefile requirements.txt
onnx
tensorrt
vllm
lmdeploy
openvino

Writing requirements.txt


In [None]:
#@title Install Packages
%%capture
!pip install -r requirements.txt

# Quantization

In [None]:
# prompt: Code that compares the execution time of the original model with
# running ONNX quantization and the execution time of the quantized model.

import time
import torch
import onnx

# Load your original model (placeholder: replace `...` with a real model).
original_model = ...

# Quantize the model using ONNX
# NOTE(review): the `onnx` package does not appear to expose a `quantize`
# submodule; dynamic quantization is normally provided by
# `onnxruntime.quantization.quantize_dynamic(model_input, model_output, ...)`,
# which operates on file paths. Confirm and switch before running this cell.
quantized_model = onnx.quantize.quantize_dynamic(
    original_model,
    weight_type=onnx.TensorProto.INT8
)

# Save the quantized model
onnx.save(quantized_model, "quantized_model.onnx")

# Measure execution time for the original model.
# perf_counter() is the monotonic, high-resolution clock intended for timing
# short intervals; time.time() can be coarse and may jump with clock changes.
start_time = time.perf_counter()
# Run inference with the original model
# ...
end_time = time.perf_counter()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# Measure execution time for the quantized model
start_time = time.perf_counter()
# Run inference with the quantized model
# ...
end_time = time.perf_counter()
quantized_execution_time = end_time - start_time
print("Quantized model execution time:", quantized_execution_time)

# Compare execution times. The ratio is undefined when the quantized run
# measured zero elapsed time (e.g. while the inference step is still a
# placeholder), so report NaN instead of raising ZeroDivisionError.
speedup = (
    original_execution_time / quantized_execution_time
    if quantized_execution_time > 0
    else float("nan")
)
print("Speedup:", speedup)


In [None]:
# prompt: Code that compares the execution time of the original model with
# running TensorRT quantization and the execution time of the quantized model.

import time
import torch
import onnx
import tensorrt as trt

# Load your original model (placeholder: replace `...` with a real model).
original_model = ...

# Quantize the model using ONNX
# NOTE(review): the `onnx` package does not appear to expose a `quantize`
# submodule; dynamic quantization is normally provided by
# `onnxruntime.quantization.quantize_dynamic(model_input, model_output, ...)`,
# which operates on file paths. Confirm and switch before running this cell.
quantized_model = onnx.quantize.quantize_dynamic(
    original_model,
    weight_type=onnx.TensorProto.INT8
)

# Save the quantized model
onnx.save(quantized_model, "quantized_model.onnx")

# Measure execution time for the original model.
# perf_counter() is the monotonic, high-resolution clock intended for timing
# short intervals; time.time() can be coarse and may jump with clock changes.
start_time = time.perf_counter()
# Run inference with the original model
# ...
end_time = time.perf_counter()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# TensorRT quantization and execution
logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open("quantized_model.onnx", "rb") as model:
    if not parser.parse(model.read()):
        for error in range(parser.num_errors):
            print(parser.get_error(error))
        # Building an engine from a network that failed to parse is pointless;
        # stop here so the parser errors printed above are the visible failure.
        raise RuntimeError("Failed to parse quantized_model.onnx with the TensorRT ONNX parser")

# Build TensorRT engine.
# `Builder.build_cuda_engine` is no longer available in current TensorRT
# releases; the supported flow is to build a serialized plan from the network
# plus a builder config and then deserialize it through a Runtime.
config = builder.create_builder_config()
serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    raise RuntimeError("TensorRT failed to build an engine from the parsed network")
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(serialized_engine)

# Measure execution time for the TensorRT quantized model
start_time = time.perf_counter()
# Run inference with the TensorRT engine
# ...
end_time = time.perf_counter()
trt_quantized_execution_time = end_time - start_time
print("TensorRT quantized model execution time:", trt_quantized_execution_time)

# Compare execution times. The ratio is undefined when the TensorRT run
# measured zero elapsed time (e.g. while the inference step is still a
# placeholder), so report NaN instead of raising ZeroDivisionError.
speedup_trt = (
    original_execution_time / trt_quantized_execution_time
    if trt_quantized_execution_time > 0
    else float("nan")
)
print("Speedup with TensorRT quantization:", speedup_trt)


In [None]:
# prompt: Code that compares the execution time of the original model with
# running vLLM quantization and the execution time of the quantized model.

import time
import torch
import onnx
import tensorrt as trt
from vllm import LLM, SamplingParams

# Load your original model (placeholder: replace `...` with a real model).
original_model = ...

# Quantize the model using ONNX
# NOTE(review): the `onnx` package does not appear to expose a `quantize`
# submodule; dynamic quantization is normally provided by
# `onnxruntime.quantization.quantize_dynamic(model_input, model_output, ...)`,
# which operates on file paths. Confirm and switch before running this cell.
quantized_model = onnx.quantize.quantize_dynamic(
    original_model,
    weight_type=onnx.TensorProto.INT8
)

# Save the quantized model
onnx.save(quantized_model, "quantized_model.onnx")

# Measure execution time for the original model.
# perf_counter() is the monotonic, high-resolution clock intended for timing
# short intervals; time.time() can be coarse and may jump with clock changes.
start_time = time.perf_counter()
# Run inference with the original model
# ...
end_time = time.perf_counter()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)


# vllm quantization and execution
# Initialize vLLM with the quantized model
# NOTE(review): vLLM presumably expects a Hugging Face model name or checkpoint
# directory here (with quantized checkpoints selected via its `quantization`
# argument) rather than an ONNX file; confirm against the vLLM docs.
llm = LLM(model="quantized_model.onnx")

# Generate text using vLLM
prompts = ["This is a prompt."]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
start_time = time.perf_counter()
result = llm.generate(prompts, sampling_params)
end_time = time.perf_counter()
vllm_quantized_execution_time = end_time - start_time
print("vLLM quantized model execution time:", vllm_quantized_execution_time)

# Compare execution times. Guard the division so a zero-length measurement
# yields NaN (ratio undefined) instead of raising ZeroDivisionError.
speedup_vllm = (
    original_execution_time / vllm_quantized_execution_time
    if vllm_quantized_execution_time > 0
    else float("nan")
)
print("Speedup with vLLM quantization:", speedup_vllm)


In [None]:
# prompt: Code that compares the execution time of the original model with
# running lmdeploy quantization and the execution time of the quantized model.
# Note: lmdeploy must be driven by importing the lmdeploy package, not via its CLI.

import time
import torch
import onnx
import tensorrt as trt
from vllm import LLM, SamplingParams
import lmdeploy

# Load your original model (placeholder: replace `...` with a real model).
original_model = ...

# Measure execution time for the original model.
# perf_counter() is the monotonic, high-resolution clock intended for timing
# short intervals; time.time() can be coarse and may jump with clock changes.
start_time = time.perf_counter()
# Run inference with the original model
# ...
end_time = time.perf_counter()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# lmdeploy quantization
# NOTE(review): `TurboMind()` is constructed without a model path and a
# `.quantize` method could not be confirmed from this notebook; verify both
# against the lmdeploy API reference before relying on this cell.
from lmdeploy.turbomind import TurboMind
turbomind = TurboMind()
quantized_model = turbomind.quantize(original_model)

# Measure execution time for the lmdeploy quantized model
start_time = time.perf_counter()
# Run inference with the quantized model
# ...
end_time = time.perf_counter()
lmdeploy_quantized_execution_time = end_time - start_time
print("lmdeploy quantized model execution time:", lmdeploy_quantized_execution_time)

# Compare execution times. The ratio is undefined when the quantized run
# measured zero elapsed time (e.g. while the inference step is still a
# placeholder), so report NaN instead of raising ZeroDivisionError.
speedup_lmdeploy = (
    original_execution_time / lmdeploy_quantized_execution_time
    if lmdeploy_quantized_execution_time > 0
    else float("nan")
)
print("Speedup with lmdeploy quantization:", speedup_lmdeploy)


In [None]:
# prompt: Code that compares the execution time of the original model with
# running OpenVINO quantization and the execution time of the quantized model.

# `time` is imported here so the cell does not depend on an earlier cell
# having been executed first.
import time

import openvino as ov

# Load your original model (placeholder: replace `...` with a real model).
original_model = ...

# Measure execution time for the original model.
# perf_counter() is the monotonic, high-resolution clock intended for timing
# short intervals; time.time() can be coarse and may jump with clock changes.
start_time = time.perf_counter()
# Run inference with the original model
# ...
end_time = time.perf_counter()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# Load the ONNX model
core = ov.Core()
model = core.read_model("path/to/save/model.onnx")

# Quantize the model
# NOTE(review): a top-level `ov.quantize` function could not be confirmed;
# OpenVINO post-training quantization is normally done through the separate
# NNCF package. Verify against the OpenVINO docs before running this cell.
quantized_model = ov.quantize(model, {}, "path/to/save/quantized_model.xml")

# Save the quantized model
ov.serialize(quantized_model, "path/to/save/quantized_model.xml")

# Load the quantized model
compiled_model_quantized = core.compile_model("path/to/save/quantized_model.xml", "CPU")

# Measure execution time for the OpenVINO quantized model
start_time = time.perf_counter()
# Run inference with the OpenVINO quantized model
# ...
end_time = time.perf_counter()
openvino_quantized_execution_time = end_time - start_time
print("OpenVINO quantized model execution time:", openvino_quantized_execution_time)

# Compare execution times. The ratio is undefined when the quantized run
# measured zero elapsed time (e.g. while the inference step is still a
# placeholder), so report NaN instead of raising ZeroDivisionError.
speedup_openvino = (
    original_execution_time / openvino_quantized_execution_time
    if openvino_quantized_execution_time > 0
    else float("nan")
)
print("Speedup with OpenVINO quantization:", speedup_openvino)
