# Test vLLM Model Loading

### 1. With Tensorizer

In [1]:
import multiprocessing
import os
import shutil
import time
import traceback
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from tensorizer import stream_io

# stream_io._ensure_https_endpoint = lambda x: x


def load_tensorize(queue):
    """Time a vLLM engine start that loads its weights through Tensorizer.

    Meant to be the target of a ``multiprocessing.Process``. On success the
    elapsed wall-clock seconds (a float covering the vLLM import, the
    argument construction and ``AsyncLLMEngine.from_engine_args``) are put on
    ``queue``. On failure the exception is logged with its traceback and
    nothing is put on the queue.

    Args:
        queue: ``multiprocessing.Queue`` that receives the elapsed time.

    This function never returns: it always terminates the process through
    ``os._exit`` -- status 0 after a successful load, 1 after an error.
    """
    import logging
    logging.basicConfig(level=logging.INFO)

    # Pessimistic default; only flipped once the timing has been queued.
    exit_code = 1
    try:
        t1 = time.perf_counter()
        from vllm import AsyncEngineArgs, AsyncLLMEngine
        logging.info("Initializing AsyncEngineArgs")
        # Public bucket, so no credentials are passed. For a private bucket,
        # supply s3_access_key_id / s3_secret_access_key from the environment
        # or a secret store -- never commit them to the notebook.
        ENGINE_ARGS = AsyncEngineArgs(
            load_format="tensorizer",
            model="EleutherAI/gpt-j-6B",
            dtype="float32",
            model_loader_extra_config=TensorizerConfig(
                tensorizer_uri="s3://tensorized/EleutherAI/gpt-j-6B/model.tensors",
                num_readers=8,
                s3_endpoint="https://accel-object.ord1.coreweave.com",
            )
        )

        logging.info("Initializing AsyncLLMEngine")
        # Keep a reference so the engine stays alive until the clock stops.
        engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS)

        t2 = time.perf_counter()
        elapsed_time = t2 - t1
        queue.put(elapsed_time)
        logging.info("Putting elapsed time in queue")
        exit_code = 0
    except Exception as e:
        # logging.exception records the message and the full traceback.
        logging.exception("Error in load_tensorize: %s", e)
    finally:
        logging.info("Exiting process")
        try:
            # os._exit() skips normal interpreter shutdown, so the queue's
            # background feeder thread must be flushed explicitly or the
            # timing can be lost before it reaches the parent.
            queue.close()
            queue.join_thread()
        finally:
            # Hard exit (presumably to avoid lingering engine threads/GPU
            # state keeping the child alive); the status reflects success.
            os._exit(exit_code)

if __name__ == "__main__":
    # Run the Tensorizer load three times, each in its own child process,
    # and collect whatever timings the children report back.
    timings = []

    for _ in range(3):
        result_queue = multiprocessing.Queue()
        worker = multiprocessing.Process(target=load_tensorize, args=(result_queue,))
        worker.start()
        worker.join()

        # A child that failed puts nothing on the queue.
        if result_queue.empty():
            print("Queue is empty, process might have failed.")
            continue
        timings.append(result_queue.get())

    for run_number, seconds in enumerate(timings, start=1):
        print(f"Elapsed time {run_number}: {seconds:.2f} seconds")

    if not timings:
        print("No elapsed times recorded.")
    else:
        mean_seconds = sum(timings) / len(timings)
        print(f"Average elapsed time: {mean_seconds:.2f} seconds")


INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


INFO 06-23 10:25:23 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='EleutherAI/gpt-j-6B', speculative_config=None, tokenizer='EleutherAI/gpt-j-6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float32, max_seq_len=2048, download_dir=None, load_format=LoadFormat.TENSORIZER, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=EleutherAI/gpt-j-6B)
INFO 06-23 10:25:23 selector.py:125] Cannot use FlashAttention-2 backend for dtype other than torch.float16 or torch.bfloat16.
INFO 06-23 10:25:23 selector.py:51] Using XFormers backend.
Downloading https://accel-object.ord1.coreweave.com/tensorized/EleutherAI/gpt-j-6B/model.tensors
INFO 06-23 10:25:25 selector.py:1

INFO:root:Putting elapsed time in queue
INFO:root:Exiting process
INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


INFO 06-23 10:26:00 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='EleutherAI/gpt-j-6B', speculative_config=None, tokenizer='EleutherAI/gpt-j-6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float32, max_seq_len=2048, download_dir=None, load_format=LoadFormat.TENSORIZER, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=EleutherAI/gpt-j-6B)
INFO 06-23 10:26:00 selector.py:125] Cannot use FlashAttention-2 backend for dtype other than torch.float16 or torch.bfloat16.
INFO 06-23 10:26:00 selector.py:51] Using XFormers backend.
Downloading https://accel-object.ord1.coreweave.com/tensorized/EleutherAI/gpt-j-6B/model.tensors
INFO 06-23 10:26:02 selector.py:1

INFO:root:Putting elapsed time in queue
INFO:root:Exiting process
INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


INFO 06-23 10:26:35 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='EleutherAI/gpt-j-6B', speculative_config=None, tokenizer='EleutherAI/gpt-j-6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float32, max_seq_len=2048, download_dir=None, load_format=LoadFormat.TENSORIZER, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=EleutherAI/gpt-j-6B)
INFO 06-23 10:26:35 selector.py:125] Cannot use FlashAttention-2 backend for dtype other than torch.float16 or torch.bfloat16.
INFO 06-23 10:26:35 selector.py:51] Using XFormers backend.
Downloading https://accel-object.ord1.coreweave.com/tensorized/EleutherAI/gpt-j-6B/model.tensors
INFO 06-23 10:26:37 selector.py:1

INFO:root:Putting elapsed time in queue
INFO:root:Exiting process


Elapsed time 1: 35.19 seconds
Elapsed time 2: 33.38 seconds
Elapsed time 3: 34.08 seconds
Average elapsed time: 34.22 seconds


### 2. Without Tensorizer

In [2]:
import multiprocessing
import os
import shutil
import time

def clear_cache(folder_path=None):
    """Delete a model cache directory so the next load starts cold.

    Args:
        folder_path: Directory to remove. Defaults to
            ``~/.cache/huggingface/hub/`` (with ``~`` expanded).

    Prints whether the folder was deleted or did not exist. Returns None.
    """
    if folder_path is None:
        # NOTE(review): this default does not honour HF_HOME / HF_HUB_CACHE
        # overrides; pass folder_path explicitly if the cache lives elsewhere.
        folder_path = os.path.expanduser(
            "~/.cache/huggingface/hub/"
        )

    # Attempt the removal directly instead of checking first, so a folder
    # that disappears between check and delete is not an error.
    try:
        shutil.rmtree(folder_path)
    except FileNotFoundError:
        print(f"Folder '{folder_path}' does not exist.")
    else:
        print(f"Folder '{folder_path}' has been deleted.")

def load_non_tensorize(queue):
    """Time a vLLM engine start that loads weights without Tensorizer.

    Meant to be the target of a ``multiprocessing.Process``. No
    ``load_format`` is passed, so vLLM uses its default loader. On success
    the elapsed wall-clock seconds (a float covering the vLLM import, the
    argument construction and ``AsyncLLMEngine.from_engine_args``) are put on
    ``queue``. On failure the exception is logged with its traceback and
    nothing is put on the queue.

    Args:
        queue: ``multiprocessing.Queue`` that receives the elapsed time.

    This function never returns: it always terminates the process through
    ``os._exit`` -- status 0 after a successful load, 1 after an error.
    """
    import logging
    logging.basicConfig(level=logging.INFO)

    # Pessimistic default; only flipped once the timing has been queued.
    exit_code = 1
    try:
        t1 = time.perf_counter()
        from vllm import AsyncEngineArgs, AsyncLLMEngine

        logging.info("Initializing AsyncEngineArgs")
        ENGINE_ARGS = AsyncEngineArgs(
            model="EleutherAI/gpt-j-6B",
            dtype="float32",
        )

        logging.info("Initializing AsyncLLMEngine")
        # Keep a reference so the engine stays alive until the clock stops.
        engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS)

        t2 = time.perf_counter()
        elapsed_time = t2 - t1
        queue.put(elapsed_time)
        logging.info("Putting elapsed time in queue")
        exit_code = 0
    except Exception as e:
        # logging.exception records the message and the full traceback.
        logging.exception("Error in load_non_tensorize: %s", e)
    finally:
        logging.info("Exiting process")
        try:
            # os._exit() skips normal interpreter shutdown, so the queue's
            # background feeder thread must be flushed explicitly or the
            # timing can be lost before it reaches the parent.
            queue.close()
            queue.join_thread()
        finally:
            # Hard exit (presumably to avoid lingering engine threads/GPU
            # state keeping the child alive); the status reflects success.
            os._exit(exit_code)

if __name__ == "__main__":
    # Run the plain vLLM load twice, wiping the Hugging Face cache before
    # each child process starts, and collect the reported timings.
    timings = []

    for _ in range(2):
        result_queue = multiprocessing.Queue()
        clear_cache()
        worker = multiprocessing.Process(target=load_non_tensorize, args=(result_queue,))
        worker.start()
        worker.join()

        # A child that failed puts nothing on the queue.
        if result_queue.empty():
            print("Queue is empty, process might have failed.")
            continue
        timings.append(result_queue.get())

    for run_number, seconds in enumerate(timings, start=1):
        print(f"Elapsed time {run_number}: {seconds:.2f} seconds")

    if not timings:
        print("No elapsed times recorded.")
    else:
        mean_seconds = sum(timings) / len(timings)
        print(f"Average elapsed time: {mean_seconds:.2f} seconds")


Folder '/home/paperspace/.cache/huggingface/hub/' has been deleted.


INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

INFO 06-23 10:28:41 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='EleutherAI/gpt-j-6B', speculative_config=None, tokenizer='EleutherAI/gpt-j-6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float32, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=EleutherAI/gpt-j-6B)


tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

INFO 06-23 10:28:43 selector.py:125] Cannot use FlashAttention-2 backend for dtype other than torch.float16 or torch.bfloat16.
INFO 06-23 10:28:43 selector.py:51] Using XFormers backend.
INFO 06-23 10:28:44 selector.py:125] Cannot use FlashAttention-2 backend for dtype other than torch.float16 or torch.bfloat16.
INFO 06-23 10:28:44 selector.py:51] Using XFormers backend.
INFO 06-23 10:28:45 weight_utils.py:207] Using model weights format ['*.bin']


pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

INFO 06-23 10:30:18 model_runner.py:146] Loading model weights took 22.5428 GB
INFO 06-23 10:30:20 gpu_executor.py:83] # GPU blocks: 3503, # CPU blocks: 292
INFO 06-23 10:30:23 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-23 10:30:23 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-23 10:30:31 model_runner.py:924] Graph capturing finished in 9 secs.


INFO:root:Putting elapsed time in queue
INFO:root:Exiting process


Folder '/home/paperspace/.cache/huggingface/hub/' has been deleted.


INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

INFO 06-23 10:30:36 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='EleutherAI/gpt-j-6B', speculative_config=None, tokenizer='EleutherAI/gpt-j-6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float32, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=EleutherAI/gpt-j-6B)


tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

INFO 06-23 10:30:37 selector.py:125] Cannot use FlashAttention-2 backend for dtype other than torch.float16 or torch.bfloat16.
INFO 06-23 10:30:37 selector.py:51] Using XFormers backend.
INFO 06-23 10:30:38 selector.py:125] Cannot use FlashAttention-2 backend for dtype other than torch.float16 or torch.bfloat16.
INFO 06-23 10:30:38 selector.py:51] Using XFormers backend.
INFO 06-23 10:30:39 weight_utils.py:207] Using model weights format ['*.bin']


pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

INFO 06-23 10:32:11 model_runner.py:146] Loading model weights took 22.5428 GB
INFO 06-23 10:32:13 gpu_executor.py:83] # GPU blocks: 3503, # CPU blocks: 292
INFO 06-23 10:32:16 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-23 10:32:16 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-23 10:32:25 model_runner.py:924] Graph capturing finished in 9 secs.


INFO:root:Putting elapsed time in queue
INFO:root:Exiting process


Elapsed time 1: 110.02 seconds
Elapsed time 2: 109.17 seconds
Average elapsed time: 109.60 seconds


# Test Raw Hugging Face AutoModelForCausalLM

### 1. With Tensorizer

In [1]:
import time
import torch
from tensorizer import TensorDeserializer
from tensorizer.utils import no_init_or_tensor, convert_bytes, get_mem_usage

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

# Benchmark: build a GPT-J model from its config, stream the weights in with
# Tensorizer, report the throughput, then generate a short sample.
model_ref = "EleutherAI/gpt-j-6B"
s3_uri = "s3://tensorized/EleutherAI/gpt-j-6B/model.tensors"

config = AutoConfig.from_pretrained(model_ref)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The timed window opens here, so it covers both the model construction
# below and the weight load that follows.
start = time.perf_counter()
# Building the model under these context managers ensures that the
# pretrained model weights are not initialized, and non-persistent buffers
# (generated at runtime) are on the correct device.
with torch.device(device), no_init_or_tensor():
    model = AutoModelForCausalLM.from_config(config)

print(f"Deserializing to {device}:")
# NOTE(review): before_mem / after_mem are captured but never reported in
# this cell.
before_mem = get_mem_usage()

# Lazy load the tensors from S3 into the model.
deserializer = TensorDeserializer(s3_uri, device=device, num_readers=8)
deserializer.load_into_module(model)
# The timed window closes once load_into_module has returned.
end = time.perf_counter()

after_mem = get_mem_usage()

# Report the total size, the elapsed time and the resulting throughput.
total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
duration = end - start
per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
deserializer.close()
print(f"Deserialized {total_bytes_str} in {end - start:0.2f}s, {per_second}/s")

# Tokenize and generate
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_ref)
# The EOS token id is reused as pad_token_id in generate() below.
eos = tokenizer.eos_token_id
input_ids = tokenizer.encode(
    "I have a dream that one day", return_tensors="pt"
).to(device)

# Generation only; no gradients are needed.
with torch.no_grad():
    output = model.generate(
        input_ids, max_new_tokens=50, do_sample=True, pad_token_id=eos
    )

print(f"Output: {tokenizer.decode(output[0], skip_special_tokens=True)}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]



Deserializing to cuda:
Downloading https://accel-object.ord1.coreweave.com/tensorized/EleutherAI/gpt-j-6B/model.tensors
Deserialized 24.3 GB in 16.11s, 1.5 GB/s




Output: I have a dream that one day, Moms and Dads will make some more peace in this world.

Moms and Dads

I have such a sweet family... and they ALL were awesome today. Even the kids got along and weren't throwing fits (except


### 2. Without Tensorizer

In [2]:
# Benchmark: load GPT-J with the stock Hugging Face `from_pretrained` path,
# report the throughput, then generate a short sample.
# `time` and `convert_bytes` are imported here so the cell runs on its own
# instead of relying on names left behind by an earlier cell.
import time

import torch
from tensorizer.utils import convert_bytes
from transformers import AutoModelForCausalLM, AutoTokenizer

model_ref = "EleutherAI/gpt-j-6B"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The timed window covers the whole from_pretrained call.
start = time.perf_counter()
model = AutoModelForCausalLM.from_pretrained(model_ref, device_map=device)
end = time.perf_counter()

# Size of the loaded weights, taken from each parameter's real element size
# rather than assuming 4 bytes (float32) per parameter.
total_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
total_bytes_str = convert_bytes(total_bytes)
duration = end - start
per_second = convert_bytes(total_bytes / duration)
# No deserializer exists in this code path, so there is nothing to close.
print(f"Deserialized {total_bytes_str} in {duration:0.2f}s, {per_second}/s")

# Tokenize and generate
tokenizer = AutoTokenizer.from_pretrained(model_ref)
# The EOS token id is reused as pad_token_id in generate() below.
eos = tokenizer.eos_token_id
input_ids = tokenizer.encode(
    "I have a dream that one day ", return_tensors="pt"
).to(device)

# Generation only; no gradients are needed.
with torch.no_grad():
    output = model.generate(
        input_ids, max_new_tokens=50, do_sample=True, pad_token_id=eos
    )

print(f"Output: {tokenizer.decode(output[0], skip_special_tokens=True)}")

pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Deserialized 24.2 GB in 79.38s, 304.9 MB/s
Output: I have a dream that one day  
they'll all be brothers.

But then I have a dream  
that I might be the first.

Who's gonna hold my hand,  
baby, and walk me over  
that bridge?
