# Installation

In [36]:
#@title Requirements
%%writefile requirements.txt
onnx
onnxruntime
onnx2pytorch
tensorrt
vllm
lmdeploy
openvino
optimum-intel[extras]

Overwriting requirements.txt


In [37]:
#@title Install Packages
%%capture
!pip install -r requirements.txt

# Select Model

In [27]:
#@title Select Language Model
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


base_model_id = "Gunulhona/tb_pretrained_sts" # @param ["Gunulhona/tb_pretrained_sts", "Gunulhona/tb_pretrained", "google/flan-t5-xxl", "meta-llama/Meta-Llama-3-8B", "meta-llama/Meta-Llama-3-70B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3", "Qwen/Qwen2-7B-Instruct", "google/gemma-7b", "MLP-KTLim/llama-3-Korean-Bllossom-8B", "EleutherAI/polyglot-ko-12.8b", "vilm/vulture-40b", "arcee-ai/Arcee-Spark", "Qwen/Qwen2-1.5B-Instruct", "OuteAI/Lite-Mistral-150M", "google/gemma-2b-it"] {allow-input: true}

origin_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    # quantization_config=bnb_config
    )

processor = AutoTokenizer.from_pretrained(
    base_model_id,
    add_special_tokens=True,
    trust_remote_code=True)
processor.model_input_names=['input_ids', 'attention_mask']
if processor.pad_token is None:
    processor.pad_token = processor.eos_token

processor.padding_side = "right"
processor.truncation_side = "right"

@torch.no_grad()
def inference(input_, model):
    model.eval()
    inputs = processor(input_,
                       return_tensors="pt",
                       padding="max_length",
                       truncation=True,
                       max_length=128)
    outputs = model.generate(**inputs,
                             max_new_tokens=50)
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]

Some weights of BartForCausalLM were not initialized from the model checkpoint at Gunulhona/tb_pretrained_sts and are newly initialized: ['decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#@title Select Vision Model

from transformers import AutoProcessor, Owlv2ForObjectDetection, Owlv2VisionConfig
import torch

base_model_id = "google/owlv2-base-patch16-ensemble" # @param ["google/owlv2-base-patch16-ensemble", ""] {allow-input: true}

processor = AutoProcessor.from_pretrained(base_model_id)
origin_model = Owlv2ForObjectDetection.from_pretrained(base_model_id)

@torch.no_grad()
def inference(input_, model):
    model.eval()
    inputs = processor(input_, return_tensors="pt")
    outputs = model(**inputs)
    return outputs

# Quantize Model


In [57]:
# prompt: 기존 모델 실행 시간과 onnx qunatization 실행 및 quantized 모델의 실행 시간 비교 하는 코드

import time
import numpy as np
import torch
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType


# Load your original model
original_model = origin_model
test_in = torch.randint(1000, (1, 128))
torch.onnx.export(original_model,
                  args={
                      "input_ids": test_in,
                      "decoder_input_ids": test_in
                      },
                  f="original_model.pt")


# Quantize the model using ONNX
quantized_model = quantize_dynamic(
    model_input="original_model.pt",
    model_output="quantized_model.onnx",
    weight_type=onnx.TensorProto.INT4
)

# Save the quantized model
# onnx.save(quantized_model, "quantized_model.onnx")

# Measure execution time for the original model
start_time = time.time()
# Run inference with the original model
for _ in range(10):
    outputs = inference(input_="test 입력 처리 요구",
                        model=origin_model)
end_time = time.time()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# Measure execution time for the quantized model
start_time = time.time()
# Run inference with the quantized model
for _ in range(10):
    preprocessed = processor("test 입력 처리 요구", return_tensors="np", padding="max_length", truncation=True, max_length=128)
    onnx_input= {
        #  "input.1": np.array([1]).astype(np.int64),
         "input.1": preprocessed["input_ids"].astype(np.int64),
        #  "input.2": preprocessed["attention_mask"].astype(np.int64)
     }
    q_session = ort.InferenceSession("quantized_model.onnx", providers=["CPUExecutionProvider"])
    q_session.run(None, input_feed=onnx_input)
end_time = time.time()
quantized_execution_time = end_time - start_time
print("Quantized model execution time:", quantized_execution_time)

# Compare execution times
speedup = original_execution_time / quantized_execution_time
print("Speedup:", speedup)




Original model execution time: 6.747868299484253
Quantized model execution time: 6.006300687789917
Speedup: 1.123464949598986


In [None]:
# prompt: 기존 모델 실행 시간과 tensorrt qunatization 실행 및 quantized 모델의 실행 시간 비교 하는 코드

import time
import torch
import onnx
import tensorrt as trt

# Load your original model
original_model = origin_model

# Measure execution time for the original model
start_time = time.time()
# Run inference with the original model
for _ in range(10):
    outputs = inference(input_="test 입력 처리 요구",
                        model=origin_model)
end_time = time.time()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# TensorRT quantization and execution
logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open("quantized_model.onnx", "rb") as model:
    if not parser.parse(model.read()):
        for error in range(parser.num_errors):
            print(parser.get_error(error))

# Build TensorRT engine
engine = builder.build_cuda_engine(network)

# Measure execution time for the TensorRT quantized model
start_time = time.time()
# Run inference with the TensorRT engine
for _ in range(10):
    preprocessed = processor("test 입력 처리 요구",
                             return_tensors="np",
                             padding="max_length",
                             truncation=True,
                             max_length=128)
    inputs[0].host = preprocessed["input_ids"].astype(np.float32)  # Assuming input_ids is the input tensor name
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
end_time = time.time()
trt_quantized_execution_time = end_time - start_time
print("TensorRT quantized model execution time:", trt_quantized_execution_time)

# Compare execution times
speedup_trt = original_execution_time / trt_quantized_execution_time
print("Speedup with TensorRT quantization:", speedup_trt)




In [8]:
# prompt: 기존 모델 실행 시간과 vllm qunatization 실행 및 quantized 모델의 실행 시간 비교 하는 코드

import time
import torch
import onnx
import tensorrt as trt
from vllm import LLM, SamplingParams

# Load your original model
original_model = origin_model

# Measure execution time for the original model
start_time = time.time()
# Run inference with the original model
for _ in range(10):
    outputs = inference(input_="test 입력 처리 요구",
                        model=origin_model)
end_time = time.time()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# vllm quantization and execution
# Initialize vLLM with the quantized model
llm = LLM(model=base_model_id)

# Generate text using vLLM
prompts = ["This is a prompt."]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
start_time = time.time()
for _ in range(10):
    result = llm.generate(prompts, sampling_params)
end_time = time.time()
vllm_quantized_execution_time = end_time - start_time
print("vLLM quantized model execution time:", vllm_quantized_execution_time)

# Compare execution times
speedup_vllm = original_execution_time / vllm_quantized_execution_time
print("Speedup with vLLM quantization:", speedup_vllm)




Original model execution time: 9.05189323425293
INFO 07-15 13:47:37 config.py:1350] Downcasting torch.float32 to torch.float16.
INFO 07-15 13:47:37 llm_engine.py:169] Initializing an LLM engine (v0.5.1) with config: model='Gunulhona/tb_pretrained_sts', speculative_config=None, tokenizer='Gunulhona/tb_pretrained_sts', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1026, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=Gunulhona/tb_pretrained_sts, use_v2_block_manager=False, enable_prefix_caching=False)


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [20]:
# prompt: 기존 모델 실행 시간과 lmdeploy qunatization 실행 및 quantized 모델의 실행 시간 비교 하는 코드
# 단 lmdeploy는 cli가 아닌 lmdeploy 패키지 import 를 통해 실행하여야 함

import time
import torch
import onnx
import lmdeploy
from onnx2pytorch import ConvertModel
from transformers import AutoConfig, AutoModelForCausalLM
from lmdeploy import turbomind as tm
from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig

# Load your original model
original_model = origin_model

# Measure execution time for the original model
start_time = time.time()
# Run inference with the original model
for _ in range(10):
    outputs = inference(input_="test 입력 처리 요구",
                        model=origin_model)
end_time = time.time()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# lmdeploy quantization
onnx_model = onnx.load("quantized_model.onnx")


pytorch_model = ConvertModel(onnx_model)
config = AutoConfig.from_pretrained("bart")  # 적절한 모델 구성으로 변경하세요
config.architectures = ["BartModel"]  # 모델 아키텍처에 맞게 조정하세요

hf_model = AutoModelForCausalLM.from_config(config)
hf_model.load_state_dict(pytorch_model.state_dict())
hf_model.save_pretrained("onnx_hf_model")

tm_model = tm.TurboMind(model_path="onnx_hf_model")

tm.quantize(
    base_model_id,
    "lmdeploy_quantized.tm",
    w_bits=4, w_group_size=128)
quantized_tm_model = tm.TurboMind(model_path=quantized_model_path)
chat = quantized_tm_model.create_chat_session()

# lmdeploy_pipe = pipeline(
#     base_model_id,
#     backend_config=TurbomindEngineConfig(model_format='hf', tp=4),
#     chat_template_config=ChatTemplateConfig(model_name='llama2'))

# Measure execution time for the lmdeploy quantized model
# Run inference with the quantized model
start_time = time.time()
# Run inference with the original model
for _ in range(10):
    response, _ = quantized_chat.chat("This is a prompt.")
end_time = time.time()
lmdeploy_quantized_execution_time = end_time - start_time
print("lmdeploy quantized model execution time:", lmdeploy_quantized_execution_time)

# Compare execution times
speedup_lmdeploy = original_execution_time / lmdeploy_quantized_execution_time
print("Speedup with lmdeploy quantization:", speedup_lmdeploy)




Original model execution time: 7.303006410598755


NotImplementedError: Conversion not implemented for op_type=DequantizeLinear.

In [44]:
# prompt: 기존 모델 실행 시간과 openvino qunatization 실행 및 quantized 모델의 실행 시간 비교 하는 코드

import openvino as ov
from optimum.intel import OVModelForCausalLM

# Load your original model
original_model = origin_model

# Measure execution time for the original model
start_time = time.time()
# Run inference with the original model
for _ in range(10):
    outputs = inference(input_="test 입력 처리 요구",
                        model=origin_model)
end_time = time.time()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# Load the ONNX model
core = ov.Core()
model = core.read_model("quantized_model.onnx")
model.reshape({model_input.any_name: ov.PartialShape([1, '?']) for model_input in model.inputs})

# Quantize the model
tput = {'PERFORMANCE_HINT': 'THROUGHPUT'}
compiled_model = core.compile_model(model, 'CPU', tput)
ireqs = ov.AsyncInferQueue(compiled_model)

ov_model = OVModelForCausalLM.from_pretrained(base_model_id, export=True)

# Save the quantized model
# ov.serialize(quantized_model, "quantized_model.xml")

# Load the quantized model

# Measure execution time for the OpenVINO quantized model
start_time = time.time()
# Run inference with the OpenVINO quantized model
for _ in range(10):
    preprocessed = processor("test 입력 처리 요구",
                             return_tensors="np",
                             padding="max_length",
                             truncation=True,
                             max_length=128)
    ireqs.start_async(preprocessed)
    # result = inference(input_="test 입력 처리 요구",
    #                    model=ov_model)
ireqs.wait_all()
end_time = time.time()
openvino_quantized_execution_time = end_time - start_time
print("OpenVINO quantized model execution time:", openvino_quantized_execution_time)

# Compare execution times
speedup_openvino = original_execution_time / openvino_quantized_execution_time
print("Speedup with OpenVINO quantization:", speedup_openvino)


Original model execution time: 29.218034744262695


Framework not specified. Using pt to export the model.
Some weights of BartForCausalLM were not initialized from the model checkpoint at Gunulhona/tb_pretrained_sts and are newly initialized: ['decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Non-default generation parameters: {'forced_eos_token_id': 2}
Using framework PyTorch: 2.3.0+cu121
Overriding 1 configuration item(s)
	- use_cache -> True
  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
Non-default generation parameters: {'forced_eos_token_id': 2}
Compiling the model to CPU ...


TypeError: Incompatible inputs of type: <class 'transformers.tokenization_utils_base.BatchEncoding'>