# Using Open Pre-trained Transformer Language Models (with HF Transformers and Speculative Decoding)

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
HF_CACHE_LOCATION = "/data/shk148/models/opt/cache"
# To pre-cache all the OPT models easily (in parallel),
# use `../precache_opt_models.py HF_CACHE_LOCATION`

# For processing on GPU, we use PyTorch
# TensorFlow is also available, but Transformers' integration with PyTorch is more robust
import torch
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
# Things allocated on GPU is not cleared automaticaly.
# This can be problamatic when running the notebook multiple times,
# Or running it out-of-order.
# `clear_torch()` tries to free some space on the GPU.
import gc
def clear_torch():
    if "assistant_model" in globals() :
        global assistant_model
        del assistant_model
    if "target_model" in globals() :
        global target_model
        del target_model
    if "tokenizer" in globals():
        global tokenizer
        del tokenizer
    if "inputs" in globals():
        global inputs
        del inputs
    if "outputs" in globals():
        global outputs
        del outputs
    gc.collect()
    torch.cuda.empty_cache()
clear_torch()
torch.cuda.memory_reserved()

0

In [3]:
clear_torch()
# This is the Target LLM
checkpoint = "facebook/opt-6.7b"
# Load the model into the GPU
target_model = AutoModelForCausalLM.from_pretrained(checkpoint, cache_dir=HF_CACHE_LOCATION).cuda()
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=HF_CACHE_LOCATION)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
import time
def evaluate_llm(assistant_checkpoint):
    assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint, cache_dir="/data/shk148/models/opt/cache").cuda() if assistant_checkpoint is not None else None
    # Generate Output
    # Perform inference and measure latency
    num_iterations = 10  # Adjust this based on your requirements
    total_time = 0.0
    if assistant_model is not None:
        for _ in range(num_iterations):
            start_time = time.time()
            outputs = target_model.generate(inputs, assistant_model=assistant_model, do_sample=True,max_new_tokens=100)
            end_time = time.time()
            total_time += end_time - start_time
        del assistant_model
        torch.cuda.empty_cache()
    else:
        start_time = time.time()
        outputs = target_model.generate(inputs, do_sample=True,max_new_tokens=100)
        end_time = time.time()
        total_time += end_time - start_time
        # print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return (total_time / num_iterations)

In [5]:
prompt = "A monolithic operating system differs"
inputs = tokenizer(prompt, return_tensors="pt").input_ids.cuda()

In [6]:
draft_models = [
	"facebook/opt-125m",
	"facebook/opt-350m",
	"facebook/opt-1.3b",
	"facebook/opt-2.7b",
    None
]
results = dict()
for model in draft_models:
    results[model] = evaluate_llm(model)
print(results)

Calling `_assisted_decoding` directly is deprecated and will be removed in v4.41. Use `generate` or a custom generation loop instead.


{'facebook/opt-125m': 2.4469939708709716, 'facebook/opt-350m': 2.9480308055877686, 'facebook/opt-1.3b': 2.500729870796204, 'facebook/opt-2.7b': 3.301098585128784, None: 0.3623969554901123}
