In [1]:
from vllm.model_executor.models import T5ForVLLMWithCache

model_path = 'vllm/model_executor/models/t5-small-model'

tokenizer_path = 'vllm/model_executor/models/t5-small-tokenizer'

# Instantiate the model with KV Cache
model = T5ForVLLMWithCache(model_dir=model_path, tokenizer_dir=tokenizer_path)

# Test generating text
input_text = "Translate English to French: cheese"
output_text = model.generate(input_text)
print("Output Text:", output_text)



  from .autonotebook import tqdm as notebook_tqdm


INFO 04-30 02:21:00 pynccl.py:58] Loading nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1


2024-04-30 02:21:01,194	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Output Text: ['fromage']


In [3]:
from vllm.model_executor.models.t5 import T5ForVLLM

# Using absolute paths for clarity
model_dir = '/data/data/vllm/vllm/model_executor/models/t5-small-model/'
tokenizer_dir = '/data/data/vllm/vllm/model_executor/models/t5-small-tokenizer/'

# Initialize the model
model = T5ForVLLM(model_dir, tokenizer_dir)

# Test input
# test_input = "Translate English to French: How are you?"

test_input = "Translate English to French: Hi How are you, whats happening, tell me somthing new?"
output = model.generate(test_input)
print("Translation:", output)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Translation: ['Comment êtes-vous, ce qui se passe, dites-moi tellement de nouveauté?']


In [4]:
import timeit

# Setup model paths

# Using absolute paths for clarity
model_dir = '/data/data/vllm/vllm/model_executor/models/t5-small-model'
tokenizer_dir = '/data/data/vllm/vllm/model_executor/models/t5-small-tokenizer'

# Import model class
from vllm.model_executor.models import T5ForVLLM, T5ForVLLMWithCache, T5ForVLLMWithDistributedCache

# Initialize models
basic_model = T5ForVLLM(model_dir, tokenizer_dir)

# Test input
input_text = "Translate English to French: How are you?"

# Define function to measure latency
def test_basic_model():
    basic_model.generate(input_text)


# Measure latency
basic_latency = timeit.timeit(test_basic_model, number=10) / 10

print(f"Average Latency Basic Model: {basic_latency:.5f} seconds")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average Latency Basic Model: 0.29745 seconds


In [5]:
import timeit

# Setup model paths

# Using absolute paths for clarity
model_dir = '/data/data/vllm/vllm/model_executor/models/t5-small-model'
tokenizer_dir = '/data/data/vllm/vllm/model_executor/models/t5-small-tokenizer'

# Import model class
from vllm.model_executor.models import T5ForVLLM, T5ForVLLMWithCache

cached_model = T5ForVLLMWithCache(model_dir, tokenizer_dir)


def test_cached_model():
    cached_model.generate(input_text)
# Measure latency

cached_latency = timeit.timeit(test_cached_model, number=10) / 10

print(f"Average Latency Cached Model: {cached_latency:.5f} seconds")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average Latency Cached Model: 0.02662 seconds


In [11]:
# Import necessary libraries
import time
import numpy as np
import psutil
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Define the model paths
# Using absolute paths for clarity
model_dir = '/data/data/vllm/vllm/model_executor/models/t5-small-model'
tokenizer_dir = '/data/data/vllm/vllm/model_executor/models/t5-small-tokenizer'

# Initialize the tokenizer
tokenizer = T5Tokenizer.from_pretrained(tokenizer_dir)

# Load the basic and cached models
basic_model = T5ForConditionalGeneration.from_pretrained(model_dir)
cached_model = T5ForConditionalGeneration.from_pretrained(model_dir)  # Assuming cache logic is internal

# Define test input
input_text = "Translate English to French: How are you?"
encoded_input = tokenizer(input_text, return_tensors="pt")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
def benchmark_model(model, input_ids, attention_mask, iterations=100):
    times = []
    cpu_usage = []
    mem_usage = []

    for _ in range(iterations):
        start_time = time.time()
        # Simulate model call
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
        elapsed = time.time() - start_time

        # Collect metrics
        times.append(elapsed)
        cpu_usage.append(psutil.cpu_percent())
        mem_usage.append(psutil.virtual_memory().used)

    return times, cpu_usage, mem_usage


In [13]:
# Run benchmark for basic model
basic_times, basic_cpu, basic_mem = benchmark_model(basic_model, **encoded_input)

# Run benchmark for cached model
cached_times, cached_cpu, cached_mem = benchmark_model(cached_model, **encoded_input)

# Calculate and print metrics
print(f"Basic Model - Avg Time: {np.mean(basic_times):.3f} sec, CPU: {np.mean(basic_cpu):.2f}%, Mem: {np.mean(basic_mem)/1e6:.2f} MB")
print(f"Cached Model - Avg Time: {np.mean(cached_times):.3f} sec, CPU: {np.mean(cached_cpu):.2f}%, Mem: {np.mean(cached_mem)/1e6:.2f} MB")




Basic Model - Avg Time: 0.178 sec, CPU: 44.83%, Mem: 50617.86 MB
Cached Model - Avg Time: 0.164 sec, CPU: 45.20%, Mem: 50853.12 MB


In [5]:
# Percentiles for response time
percentiles = [50, 90, 99]
basic_p = np.percentile(basic_times, percentiles)
cached_p = np.percentile(cached_times, percentiles)

print("Response Time Percentiles (sec):")
print("P50, P90, P99")
for i, p in enumerate(percentiles):
    print(f"Basic Model - P{p}: {basic_p[i]:.3f}")
    print(f"Cached Model - P{p}: {cached_p[i]:.3f}")


Response Time Percentiles (sec):
P50, P90, P99
Basic Model - P50: 0.168
Cached Model - P50: 0.166
Basic Model - P90: 0.275
Cached Model - P90: 0.175
Basic Model - P99: 0.338
Cached Model - P99: 0.202


In [1]:
from vllm.model_executor.models.t5 import T5ForVLLM

# Using absolute paths for clarity
model_dir = '/data/data/vllm/vllm/model_executor/models/t5-small-model/'
tokenizer_dir = '/data/data/vllm/vllm/model_executor/models/t5-small-tokenizer/'

# Initialize the model
model = T5ForVLLM(model_dir, tokenizer_dir)

# Test input
# test_input = "Translate English to French: How are you?"

test_input = "Translate English to French: Hi How are you, whats happening, tell me somthing new?"
output = model.generate(test_input)
print("Translation:", output)


  from .autonotebook import tqdm as notebook_tqdm


INFO 04-30 02:09:51 pynccl.py:58] Loading nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1


2024-04-30 02:09:51,503	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Translation: ['Comment êtes-vous, ce qui se passe, dites-moi tellement de nouveauté?']


In [1]:
from vllm.model_executor.models import T5ForVLLMWithDistributedCache
import torch
# Using absolute paths for clarity
model_path = '/data/data/vllm/vllm/model_executor/models/t5-small-model/'
tokenizer_path = '/data/data/vllm/vllm/model_executor/models/t5-small-tokenizer/'

# Instantiate the model with distributed KV Cache
model = T5ForVLLMWithDistributedCache(model_dir=model_path, tokenizer_dir=tokenizer_path)

# Test generating text
input_text = "Translate English to French: cheese"
output_text = model.generate(input_text)
print("Output Text:", output_text)


  from .autonotebook import tqdm as notebook_tqdm


INFO 04-30 02:18:34 pynccl.py:58] Loading nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1


2024-04-30 02:18:34,852	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Output Text: ['fromage']
