# Installation

In [None]:
#@title Requirements
%%writefile requirements.txt
onnx
onnxruntime
tensorrt
vllm
lmdeploy
openvino

Overwriting requirements.txt


In [None]:
#@title Install Packages
%%capture
!pip install -r requirements.txt

# Select Model

In [1]:
#@title Select Language Model
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


base_model_id = "Gunulhona/tb_pretrained_sts" # @param ["Gunulhona/tb_pretrained_sts", "Gunulhona/tb_pretrained", "google/flan-t5-xxl", "meta-llama/Meta-Llama-3-8B", "meta-llama/Meta-Llama-3-70B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3", "Qwen/Qwen2-7B-Instruct", "google/gemma-7b", "MLP-KTLim/llama-3-Korean-Bllossom-8B", "EleutherAI/polyglot-ko-12.8b", "vilm/vulture-40b", "arcee-ai/Arcee-Spark", "Qwen/Qwen2-1.5B-Instruct", "OuteAI/Lite-Mistral-150M", "google/gemma-2b-it"] {allow-input: true}

origin_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    # quantization_config=bnb_config
    )

processor = AutoTokenizer.from_pretrained(
    base_model_id,
    add_special_tokens=True,
    trust_remote_code=True)
processor.model_input_names=['input_ids', 'attention_mask']
if processor.pad_token is None:
    processor.pad_token = processor.eos_token

processor.padding_side = "right"
processor.truncation_side = "right"

@torch.no_grad()
def inference(input_, model):
    model.eval()
    inputs = processor(input_,
                       return_tensors="pt",
                    #    padding="max_length",
                    #    truncation=True,
                    #    max_length=128
                       )
    outputs = model.generate(**inputs)
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BartForCausalLM were not initialized from the model checkpoint at Gunulhona/tb_pretrained_sts and are newly initialized: ['decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#@title Select Vision Model

from transformers import AutoProcessor, Owlv2ForObjectDetection, Owlv2VisionConfig
import torch

base_model_id = "google/owlv2-base-patch16-ensemble" # @param ["google/owlv2-base-patch16-ensemble", ""] {allow-input: true}

processor = AutoProcessor.from_pretrained(base_model_id)
origin_model = Owlv2ForObjectDetection.from_pretrained(base_model_id)

@torch.no_grad()
def inference(input_, model):
    model.eval()
    inputs = processor(input_, return_tensors="pt")
    outputs = model(**inputs)
    return outputs

# Quantize Model


In [57]:
# prompt: 기존 모델 실행 시간과 onnx qunatization 실행 및 quantized 모델의 실행 시간 비교 하는 코드

import time
import numpy as np
import torch
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType


# Load your original model
original_model = origin_model
test_in = torch.randint(1000, (1, 128))
torch.onnx.export(original_model,
                  args={
                      "input_ids": test_in,
                      "decoder_input_ids": test_in
                      },
                  f="original_model.pt")


# Quantize the model using ONNX
quantized_model = quantize_dynamic(
    model_input="original_model.pt",
    model_output="quantized_model.onnx",
    weight_type=onnx.TensorProto.INT4
)

# Save the quantized model
# onnx.save(quantized_model, "quantized_model.onnx")

# Measure execution time for the original model
start_time = time.time()
# Run inference with the original model
for _ in range(10):
    outputs = inference(input_="test 입력 처리 요구",
                        model=origin_model)
end_time = time.time()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# Measure execution time for the quantized model
start_time = time.time()
# Run inference with the quantized model
for _ in range(10):
    preprocessed = processor("test 입력 처리 요구", return_tensors="np", padding="max_length", truncation=True, max_length=128)
    onnx_input= {
        #  "input.1": np.array([1]).astype(np.int64),
         "input.1": preprocessed["input_ids"].astype(np.int64),
        #  "input.2": preprocessed["attention_mask"].astype(np.int64)
     }
    q_session = ort.InferenceSession("quantized_model.onnx", providers=["CPUExecutionProvider"])
    q_session.run(None, input_feed=onnx_input)
end_time = time.time()
quantized_execution_time = end_time - start_time
print("Quantized model execution time:", quantized_execution_time)

# Compare execution times
speedup = original_execution_time / quantized_execution_time
print("Speedup:", speedup)




Original model execution time: 6.747868299484253
Quantized model execution time: 6.006300687789917
Speedup: 1.123464949598986


In [None]:
# prompt: 기존 모델 실행 시간과 tensorrt qunatization 실행 및 quantized 모델의 실행 시간 비교 하는 코드

import time
import torch
import onnx
import tensorrt as trt

# Load your original model
original_model = origin_model


# Measure execution time for the original model
start_time = time.time()
# Run inference with the original model
for _ in range(10):
    outputs = inference(input_="test 입력 처리 요구",
                        model=origin_model)
end_time = time.time()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# TensorRT quantization and execution
logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open("quantized_model.onnx", "rb") as model:
    if not parser.parse(model.read()):
        for error in range(parser.num_errors):
            print(parser.get_error(error))

# Build TensorRT engine
engine = builder.build_cuda_engine(network)

# Measure execution time for the TensorRT quantized model
start_time = time.time()
# Run inference with the TensorRT engine
for _ in range(10):
    preprocessed = processor("test 입력 처리 요구",
                             return_tensors="np",
                             padding="max_length",
                             truncation=True,
                             max_length=128)
    inputs[0].host = preprocessed["input_ids"].astype(np.float32)  # Assuming input_ids is the input tensor name
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
end_time = time.time()
trt_quantized_execution_time = end_time - start_time
print("TensorRT quantized model execution time:", trt_quantized_execution_time)

# Compare execution times
speedup_trt = original_execution_time / trt_quantized_execution_time
print("Speedup with TensorRT quantization:", speedup_trt)




In [7]:
# prompt: 기존 모델 실행 시간과 vllm qunatization 실행 및 quantized 모델의 실행 시간 비교 하는 코드

import time
import torch
import onnx
import tensorrt as trt
from vllm import LLM, SamplingParams

# Load your original model
original_model = origin_model
torch.save(original_model.model.state_dict(), "original_model.pth")
# Measure execution time for the original model
start_time = time.time()
# Run inference with the original model
for _ in range(10):
    outputs = inference(input_="test 입력 처리 요구",
                        model=origin_model)
end_time = time.time()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)


# vllm quantization and execution
# Initialize vLLM with the quantized model
llm = LLM(model="original_model.pth")

# Generate text using vLLM
prompts = ["This is a prompt."]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
start_time = time.time()
for _ in range(10):
    result = llm.generate(prompts, sampling_params)
end_time = time.time()
vllm_quantized_execution_time = end_time - start_time
print("vLLM quantized model execution time:", vllm_quantized_execution_time)

# Compare execution times
speedup_vllm = original_execution_time / vllm_quantized_execution_time
print("Speedup with vLLM quantization:", speedup_vllm)




Original model execution time: 9.024619102478027


OSError: It looks like the config file at 'original_model.pth' is not a valid JSON file.

In [None]:
# prompt: 기존 모델 실행 시간과 lmdeploy qunatization 실행 및 quantized 모델의 실행 시간 비교 하는 코드
# 단 lmdeploy는 cli가 아닌 lmdeploy 패키지 import 를 통해 실행하여야 함

import time
import torch
import onnx
import tensorrt as trt
from vllm import LLM, SamplingParams
import lmdeploy

# Load your original model
original_model = ...

# Measure execution time for the original model
start_time = time.time()
# Run inference with the original model
# ...
end_time = time.time()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# lmdeploy quantization
from lmdeploy.turbomind import TurboMind
turbomind = TurboMind()
quantized_model = turbomind.quantize(original_model)

# Measure execution time for the lmdeploy quantized model
start_time = time.time()
# Run inference with the quantized model
# ...
end_time = time.time()
lmdeploy_quantized_execution_time = end_time - start_time
print("lmdeploy quantized model execution time:", lmdeploy_quantized_execution_time)

# Compare execution times
speedup_lmdeploy = original_execution_time / lmdeploy_quantized_execution_time
print("Speedup with lmdeploy quantization:", speedup_lmdeploy)


In [None]:
# prompt: 기존 모델 실행 시간과 openvino qunatization 실행 및 quantized 모델의 실행 시간 비교 하는 코드

import openvino as ov

# Load your original model
original_model = ...

# Measure execution time for the original model
start_time = time.time()
# Run inference with the original model
# ...
end_time = time.time()
original_execution_time = end_time - start_time
print("Original model execution time:", original_execution_time)

# Load the ONNX model
core = ov.Core()
model = core.read_model("path/to/save/model.onnx")

# Quantize the model
quantized_model = ov.quantize(model, {}, "path/to/save/quantized_model.xml")

# Save the quantized model
ov.serialize(quantized_model, "path/to/save/quantized_model.xml")

# Load the quantized model
compiled_model_quantized = core.compile_model("path/to/save/quantized_model.xml", "CPU")

# Measure execution time for the OpenVINO quantized model
start_time = time.time()
# Run inference with the OpenVINO quantized model
# ...
end_time = time.time()
openvino_quantized_execution_time = end_time - start_time
print("OpenVINO quantized model execution time:", openvino_quantized_execution_time)

# Compare execution times
speedup_openvino = original_execution_time / openvino_quantized_execution_time
print("Speedup with OpenVINO quantization:", speedup_openvino)
