In [1]:
import multiprocessing
import os
import shutil
import time

def clear_cache(folder_path=None):
    """Remove a cached model folder from disk and report what happened.

    Args:
        folder_path: Directory to delete; a leading ``~`` is expanded. When
            omitted, the Meta-Llama-3-8B-Instruct folder under
            ``~/.cache/huggingface/hub`` is used, which is what this function
            always targeted before the parameter existed.

    Prints one line saying whether the folder was deleted or did not exist.
    """
    if folder_path is None:
        # NOTE(review): this default assumes the Hugging Face cache location
        # has not been redirected (e.g. via HF_HOME / HF_HUB_CACHE) -- confirm
        # for the machine running the benchmark, otherwise nothing is cleared.
        folder_path = (
            "~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/"
        )
    folder_path = os.path.expanduser(folder_path)

    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
        print(f"Folder '{folder_path}' has been deleted.")
    else:
        print(f"Folder '{folder_path}' does not exist.")

def load_non_tensorize(queue):
    """Time a default-format vLLM engine start-up and report it via ``queue``.

    Meant to be used as a ``multiprocessing.Process`` target. On success the
    elapsed wall-clock time in seconds (a float) is put on ``queue``.

    Args:
        queue: A ``multiprocessing.Queue`` (or any object offering ``put``,
            ``close`` and ``join_thread``) used to send the timing back.

    This function never returns: the process is always ended with
    ``os._exit``. The exit status is 0 when a timing was recorded and 1 when
    anything went wrong, so the parent can tell the two apart.
    """
    import logging
    logging.basicConfig(level=logging.INFO)

    exit_code = 1  # Treated as a failure until the timing has been queued.
    try:
        t_start = time.perf_counter()
        # The import sits inside the timed region, so the measurement
        # includes the cost of importing vllm.
        from vllm import AsyncEngineArgs, AsyncLLMEngine

        logging.info("Initializing AsyncEngineArgs")
        engine_args = AsyncEngineArgs(
            model="meta-llama/Meta-Llama-3-8B-Instruct",
        )

        logging.info("Initializing AsyncLLMEngine")
        # Kept bound to a name so the engine stays alive until timing stops.
        engine = AsyncLLMEngine.from_engine_args(engine_args)

        elapsed_time = time.perf_counter() - t_start
        logging.info("Putting elapsed time in queue")
        queue.put(elapsed_time)
        exit_code = 0
    except Exception as e:
        # logging.exception records the full traceback, not just the message.
        logging.exception("Error in load_non_tensorize: %s", e)
    finally:
        logging.info("Exiting process")
        try:
            # os._exit skips normal interpreter shutdown, which is what would
            # otherwise flush the queue's background feeder thread. Flush it
            # explicitly so the queued timing is not lost.
            queue.close()
            queue.join_thread()
        finally:
            os._exit(exit_code)

if __name__ == "__main__":
    # Benchmark driver: run the loader three times, each in a fresh process
    # and after clearing the model cache, collecting the reported timings.
    elapsed_times = []

    for run_index in range(3):
        result_queue = multiprocessing.Queue()
        clear_cache()
        worker = multiprocessing.Process(
            target=load_non_tensorize, args=(result_queue,)
        )
        worker.start()
        worker.join()

        # The worker only puts a value on success; nothing queued means no timing.
        if result_queue.empty():
            print("Queue is empty, process might have failed.")
            continue
        elapsed_times.append(result_queue.get())

    # Per-run report, numbered from 1.
    for run_number, seconds in enumerate(elapsed_times, start=1):
        print(f"Elapsed time {run_number}: {seconds:.2f} seconds")

    # Summary line; guard against dividing by zero when every run failed.
    if not elapsed_times:
        print("No elapsed times recorded.")
    else:
        average = sum(elapsed_times) / len(elapsed_times)
        print(f"Average elapsed time: {average:.2f} seconds")


Folder '/home/paperspace/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/' does not exist.


INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

INFO 06-22 02:55:10 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

INFO 06-22 02:55:12 weight_utils.py:207] Using model weights format ['*.safetensors']


model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

INFO 06-22 02:55:39 model_runner.py:146] Loading model weights took 14.9595 GB
INFO 06-22 02:55:40 gpu_executor.py:83] # GPU blocks: 27975, # CPU blocks: 2048
INFO 06-22 02:55:42 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-22 02:55:42 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-22 02:55:46 model_runner.py:924] Graph capturing finished in 4 secs.


INFO:root:Putting elapsed time in queue
INFO:root:Exiting process


Folder '/home/paperspace/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/' has been deleted.


INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

INFO 06-22 02:55:51 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

INFO 06-22 02:55:53 weight_utils.py:207] Using model weights format ['*.safetensors']


model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

INFO 06-22 02:56:19 model_runner.py:146] Loading model weights took 14.9595 GB
INFO 06-22 02:56:20 gpu_executor.py:83] # GPU blocks: 27975, # CPU blocks: 2048
INFO 06-22 02:56:22 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-22 02:56:22 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-22 02:56:26 model_runner.py:924] Graph capturing finished in 4 secs.


INFO:root:Putting elapsed time in queue
INFO:root:Exiting process


Folder '/home/paperspace/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/' has been deleted.


INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

INFO 06-22 02:56:31 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

INFO 06-22 02:56:33 weight_utils.py:207] Using model weights format ['*.safetensors']


model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

INFO 06-22 02:56:57 model_runner.py:146] Loading model weights took 14.9595 GB
INFO 06-22 02:56:58 gpu_executor.py:83] # GPU blocks: 27975, # CPU blocks: 2048
INFO 06-22 02:56:59 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-22 02:56:59 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-22 02:57:04 model_runner.py:924] Graph capturing finished in 4 secs.


INFO:root:Putting elapsed time in queue
INFO:root:Exiting process


Elapsed time 1: 39.10 seconds
Elapsed time 2: 36.56 seconds
Elapsed time 3: 34.46 seconds
Average elapsed time: 36.71 seconds


In [6]:
import multiprocessing
import os
import shutil
import time
import traceback
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from tensorizer import stream_io

def _keep_endpoint_unchanged(x):
    """Return the given endpoint value untouched."""
    return x


# Swap tensorizer's private endpoint helper for an identity function.
# NOTE(review): presumably needed so the plain-http local S3 endpoint used by
# load_tensorize is not rewritten or rejected -- confirm against the installed
# tensorizer version, since this patches a private API.
stream_io._ensure_https_endpoint = _keep_endpoint_unchanged


def load_tensorize(queue):
    """Time a tensorizer-format vLLM engine start-up and report it via ``queue``.

    Meant to be used as a ``multiprocessing.Process`` target. On success the
    elapsed wall-clock time in seconds (a float) is put on ``queue``.

    S3 settings are read from the environment instead of being embedded in
    the notebook:

    * ``S3_ACCESS_KEY_ID`` (required)
    * ``S3_SECRET_ACCESS_KEY`` (required)
    * ``S3_ENDPOINT_URL`` (optional, defaults to ``http://127.0.0.1:9000``)

    Args:
        queue: A ``multiprocessing.Queue`` (or any object offering ``put``,
            ``close`` and ``join_thread``) used to send the timing back.

    This function never returns: the process is always ended with
    ``os._exit``. The exit status is 0 when a timing was recorded and 1 when
    anything went wrong (including missing credentials).
    """
    import logging
    logging.basicConfig(level=logging.INFO)

    exit_code = 1  # Treated as a failure until the timing has been queued.
    try:
        # Credentials must never live in the notebook source; any keys that
        # were previously committed here should be treated as exposed and
        # rotated. Checked before the timer starts so it does not skew results.
        missing = [
            name
            for name in ("S3_ACCESS_KEY_ID", "S3_SECRET_ACCESS_KEY")
            if not os.environ.get(name)
        ]
        if missing:
            raise RuntimeError(
                f"Missing required environment variable(s): {', '.join(missing)}"
            )

        t_start = time.perf_counter()
        # The import sits inside the timed region, so the measurement
        # includes the cost of importing vllm.
        from vllm import AsyncEngineArgs, AsyncLLMEngine

        logging.info("Initializing AsyncEngineArgs")
        engine_args = AsyncEngineArgs(
            load_format="tensorizer",
            model="meta-llama/Meta-Llama-3-8B-Instruct",
            model_loader_extra_config=TensorizerConfig(
                tensorizer_uri="s3://bentoml-s3-store/vllm/meta-llama/Meta-Llama-3-8B-Instruct/v1/model.tensors",
                num_readers=8,
                s3_endpoint=os.environ.get("S3_ENDPOINT_URL", "http://127.0.0.1:9000"),
                s3_access_key_id=os.environ["S3_ACCESS_KEY_ID"],
                s3_secret_access_key=os.environ["S3_SECRET_ACCESS_KEY"],
            ),
        )

        logging.info("Initializing AsyncLLMEngine")
        # Kept bound to a name so the engine stays alive until timing stops.
        engine = AsyncLLMEngine.from_engine_args(engine_args)

        elapsed_time = time.perf_counter() - t_start
        logging.info("Putting elapsed time in queue")
        queue.put(elapsed_time)
        exit_code = 0
    except Exception as e:
        # logging.exception records the full traceback, not just the message.
        logging.exception("Error in load_tensorize: %s", e)
    finally:
        logging.info("Exiting process")
        try:
            # os._exit skips normal interpreter shutdown, which is what would
            # otherwise flush the queue's background feeder thread. Flush it
            # explicitly so the queued timing is not lost.
            queue.close()
            queue.join_thread()
        finally:
            os._exit(exit_code)

if __name__ == "__main__":
    # Benchmark driver: run the tensorizer loader three times, each in a
    # fresh process, collecting the reported timings.
    elapsed_times = []

    for run_index in range(3):
        result_queue = multiprocessing.Queue()
        worker = multiprocessing.Process(
            target=load_tensorize, args=(result_queue,)
        )
        worker.start()
        worker.join()

        # The worker only puts a value on success; nothing queued means no timing.
        if result_queue.empty():
            print("Queue is empty, process might have failed.")
            continue
        elapsed_times.append(result_queue.get())

    # Per-run report, numbered from 1.
    for run_number, seconds in enumerate(elapsed_times, start=1):
        print(f"Elapsed time {run_number}: {seconds:.2f} seconds")

    # Summary line; guard against dividing by zero when every run failed.
    if not elapsed_times:
        print("No elapsed times recorded.")
    else:
        average = sum(elapsed_times) / len(elapsed_times)
        print(f"Average elapsed time: {average:.2f} seconds")


INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


INFO 06-22 03:09:19 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.TENSORIZER, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-22 03:09:26 tensorizer.py:344] Deserialized 16.1 GB in 5.48s, 2.9 GB/s
INFO 06-22 03:09:26 tensorizer.py:346] Memory usage before: CPU: (maxrss: 1,592MiB F: 9,252MiB) GPU: (U: 814MiB F: 80,414MiB T: 81,228MiB) TORCH: (R: 22MiB/22MiB, A: 2MiB/10MiB)
INFO 06-22 03:09:26 tensorizer.py:347] Memory usage after: CPU: (maxrss: 6,457MiB F: 5,406MiB) GPU: (U: 16,172MiB F: 65,056MiB T: 81,228MiB) TORCH: (R: 15,374MiB/15,374MiB, A: 15,318MiB/15,321MiB)
INFO 06-22 03:09:26 model_runner.py:146] Loading model weights took 14.9595 GB
INFO 06-22 03:09:27 gpu_executor.py:83] # GPU blocks: 27954, # CPU blocks: 2048
INFO 06-22 03:09:28 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-22 03:09:28 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decrea

INFO:root:Putting elapsed time in queue
INFO:root:Exiting process
INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


INFO 06-22 03:09:35 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.TENSORIZER, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-22 03:09:41 tensorizer.py:344] Deserialized 16.1 GB in 5.37s, 3.0 GB/s
INFO 06-22 03:09:41 tensorizer.py:346] Memory usage before: CPU: (maxrss: 1,591MiB F: 9,271MiB) GPU: (U: 814MiB F: 80,414MiB T: 81,228MiB) TORCH: (R: 22MiB/22MiB, A: 2MiB/10MiB)
INFO 06-22 03:09:41 tensorizer.py:347] Memory usage after: CPU: (maxrss: 6,456MiB F: 5,400MiB) GPU: (U: 16,172MiB F: 65,056MiB T: 81,228MiB) TORCH: (R: 15,374MiB/15,374MiB, A: 15,318MiB/15,321MiB)
INFO 06-22 03:09:42 model_runner.py:146] Loading model weights took 14.9595 GB
INFO 06-22 03:09:43 gpu_executor.py:83] # GPU blocks: 27954, # CPU blocks: 2048
INFO 06-22 03:09:44 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-22 03:09:44 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decrea

INFO:root:Putting elapsed time in queue
INFO:root:Exiting process
INFO:root:Initializing AsyncEngineArgs
INFO:root:Initializing AsyncLLMEngine


INFO 06-22 03:09:50 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.TENSORIZER, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-22 03:09:57 tensorizer.py:344] Deserialized 16.1 GB in 5.42s, 3.0 GB/s
INFO 06-22 03:09:57 tensorizer.py:346] Memory usage before: CPU: (maxrss: 1,592MiB F: 9,274MiB) GPU: (U: 814MiB F: 80,414MiB T: 81,228MiB) TORCH: (R: 22MiB/22MiB, A: 2MiB/10MiB)
INFO 06-22 03:09:57 tensorizer.py:347] Memory usage after: CPU: (maxrss: 6,457MiB F: 5,431MiB) GPU: (U: 16,172MiB F: 65,056MiB T: 81,228MiB) TORCH: (R: 15,374MiB/15,374MiB, A: 15,318MiB/15,321MiB)
INFO 06-22 03:09:57 model_runner.py:146] Loading model weights took 14.9595 GB
INFO 06-22 03:09:58 gpu_executor.py:83] # GPU blocks: 27954, # CPU blocks: 2048
INFO 06-22 03:09:59 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-22 03:09:59 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decrea

INFO:root:Putting elapsed time in queue
INFO:root:Exiting process


Elapsed time 1: 14.92 seconds
Elapsed time 2: 14.09 seconds
Elapsed time 3: 14.09 seconds
Average elapsed time: 14.37 seconds
