# Introduction

This notebook shows how to generate text with Hugging Face Transformers' `generate` and with vLLM, and compares their greedy-decoding outputs.

In [1]:
import os
# These variables are assigned before torch / transformers / vllm are imported
# below; presumably so the libraries pick them up at import/initialisation time.
# Restrict this process to GPUs 0 and 1.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
# NOTE(review): machine-specific absolute path for the Hugging Face hub cache;
# adjust it when running on another host.
os.environ['HF_HUB_CACHE'] = '/next_share/hf_cache/hub'

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, GenerationMixin, T5ForConditionalGeneration
from vllm import LLM, SamplingParams

## Generate with VLLM

In [2]:
# Build the vLLM engine for the Llama-3-8B base checkpoint, leaving every
# other engine option at its default.
engine = LLM(model='meta-llama/Meta-Llama-3-8B')

INFO 05-21 17:30:34 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='meta-llama/Meta-Llama-3-8B', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-21 17:30:36 utils.py:660] Found nccl from library /storage/rhshui/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-21 17:30:36 selector.py:27] Using FlashAttention-2 backend.
INFO 05-21 17:30:38 weight_utils.py:199] Using model weights format ['*.safetensors']
INFO 05-21 17:31:22 model_runner.py:175] Loading model weights took 14.9595 GB
INFO 05-21 17:31:24 gpu_executor.py:114] # GPU blocks: 2259, # CPU blocks: 2048
INFO 05-21 17:31:27 model_runner.py:937] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-21 17:31:27 model_runner.py:941] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-21 17:31:31 model_runner.py:1017] Graph capturi

In [3]:
# Prompts used by both the vLLM and the Transformers runs in this notebook.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# temperature=0.0 requests greedy decoding; ignore_eos=True is meant to keep
# generating past an end-of-sequence token so that every completion runs to
# max_tokens.
sampling_params = SamplingParams(
    temperature=0.0,
    top_p=0.95,
    max_tokens=300,
    ignore_eos=True,
)

In [5]:
# Submit the prompts one at a time (vLLM's own progress bar is disabled and
# tqdm reports progress instead), keeping the single result of each call.
outputs = []
for single_prompt in tqdm(prompts):
    request_results = engine.generate(single_prompt, sampling_params, use_tqdm=False)
    outputs.append(request_results[0])
# Batched alternative:
# outputs = engine.generate(prompts, sampling_params)

100%|██████████| 4/4 [00:29<00:00,  7.33s/it]


In [6]:
# Show each prompt next to the text of its first (only) completion.
for request_output in outputs:
    first_completion = request_output.outputs[0]
    print(f"Prompt: {request_output.prompt!r}, Generated text: {first_completion.text!r}")

Prompt: 'Hello, my name is', Generated text: " and I'm writing you today to learn more about the 2019 Ford F-150 XL SuperCrew 4WD. I live at in , and I would like to hear back from you soon and learn more about the Ford F-150. Please call me at at your earliest convenience.://www.google.com/url?q=https://www.automaxx.com/used-vehicles/2019-ford-f-150-xl-supercrew-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4x4-4wd-4"
Prompt: 'The president of the United States is', Generated text: ' the most powerful person in the world. He or she is the leader of the free world, the commander-in-chief of the armed forces, and the head of state of the United States. The president is also the chief executive officer of the federal government and the head of the executive branch of the government. The president is elected b

In [50]:
# Display the active sampling configuration.
# NOTE(review): the saved output of this cell reports max_tokens=800 and
# ignore_eos=False, which does not match the SamplingParams built earlier in
# this notebook (max_tokens=300, ignore_eos=True); it appears to come from a
# different execution. Re-run the notebook top to bottom to refresh it.
sampling_params

SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=[], stop_token_ids=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=800, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None)

In [7]:
# Number of generated tokens per prompt in the vLLM results.
generated_token_counts = [
    len(request_output.outputs[0].token_ids) for request_output in outputs
]
print(generated_token_counts)

[300, 300, 300, 300]


## Generate with Transformers

In [8]:
# Load the same checkpoint through Transformers in bfloat16 (the dtype the
# vLLM engine log reports). Inputs are later moved to GPU index 1 to match
# device_map=1.
model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Meta-Llama-3-8B',
    device_map=1,
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Reuse the tokenizer owned by the vLLM engine for the Transformers runs.
# NOTE(review): the assignments below modify that tokenizer object in place.
tk = engine.get_tokenizer()
print(tk.eos_token)
# Pad on the left and use the end-of-sequence token as the padding token.
tk.padding_side = 'left'
tk.pad_token = tk.eos_token

<|end_of_text|>


In [10]:
# Tokenize every prompt separately (no padding needed) as PyTorch tensors.
inputs = []
for prompt_text in prompts:
    inputs.append(tk(prompt_text, return_tensors='pt'))

In [11]:
def to_cuda(kws, gpu=0):
    """Return a new dict with every value of ``kws`` moved via ``.cuda(gpu)``.

    Args:
        kws: Mapping whose values expose a ``.cuda(device)`` method
            (e.g. the tensors of a tokenizer encoding).
        gpu: Device index forwarded to each ``.cuda()`` call. Defaults to 0.

    Returns:
        A plain ``dict`` with the same keys and the moved values.
    """
    moved = {}
    for name, value in kws.items():
        moved[name] = value.cuda(gpu)
    return moved

In [12]:
# Greedy decoding settings for the per-prompt Transformers run.
# eos_token_id=-1 is presumably used so that no real token ever matches the
# stop id, mirroring ignore_eos=True on the vLLM side.
greedy_kwargs = dict(
    do_sample=False,
    max_new_tokens=300,
    eos_token_id=-1,
    pad_token_id=tk.eos_token_id,
)

# Generate one prompt at a time on GPU 1 and keep the single returned sequence.
all_outs = []
for encoded_prompt in tqdm(inputs):
    sequences = model.generate(**to_cuda(encoded_prompt, 1), **greedy_kwargs)
    all_outs.append(sequences[0])

100%|██████████| 4/4 [00:36<00:00,  9.24s/it]


In [110]:
# Batched Transformers run: encode all prompts together (padded to a common
# length), move the batch to GPU 1 and decode greedily.
enc = tk(prompts, padding=True, return_tensors='pt')
batch_on_gpu = to_cuda(enc, 1)
r_ba = model.generate(
    **batch_on_gpu,
    do_sample=False,
    max_new_tokens=300,
    eos_token_id=-1,
    pad_token_id=tk.eos_token_id,
)

In [93]:
# Total sequence length (padded prompt + generated tokens) of each batch row.
# NOTE(review): the saved output shows 520 per row, which does not match
# max_new_tokens = 300 in the batched generate call; it looks like it was
# produced by an earlier run with a different token budget. Re-run to refresh.
row_lengths = [len(sequence) for sequence in r_ba]
print(row_lengths)

[520, 520, 520, 520]


In [87]:
# Decode the full third row of the batched output, special tokens included.
third_row_text = tk.decode(r_ba[2])
print(third_row_text)

<|end_of_text|><|end_of_text|><|begin_of_text|>The capital of France is Paris. It is located in the north of the country. The city is situated on the banks of the Seine River. Paris is the largest city in France. It is also the largest city in the European Union. The city has a population of over 2.2 million people. Paris is a major tourist destination. It is home to many famous landmarks, including the Eiffel Tower, the Louvre Museum, and the Notre Dame Cathedral. The city is also a major center of culture and the arts. Paris is home to many world-renowned museums, galleries, and theaters. The city is also a major center of fashion and design. Paris is a major center of business and finance. The city is home to many multinational corporations and financial institutions. Paris is a major transportation hub. The city is served by two international airports and a major railway station. Paris is a major center of education. The city is home to many universities and colleges. Paris is a ma

In [49]:
# Integer id of the tokenizer's end-of-sequence token (also passed as
# pad_token_id in the generate calls).
tk.eos_token_id

128001

In [97]:
# Attention mask of the third prompt in the padded batch; with left padding
# the leading zeros mark the pad positions.
enc.attention_mask[2]

tensor([0, 0, 1, 1, 1, 1, 1, 1])

In [99]:
# The batched Transformers output starts with the left-padded prompt, so drop
# that prefix to keep only the newly generated tokens. The prefix width is
# read from the encoded batch instead of being hard-coded, so the slice stays
# correct if the prompts (and therefore the padded length) change.
prompt_width = enc.input_ids.shape[1]
a = r_ba[2][prompt_width:]
# Token ids generated by vLLM for the same prompt (index 2).
b = outputs[2].outputs[0].token_ids
# Both runs should report the same number of generated tokens.
print(len(a), len(b))

512 512


In [100]:
# Continuation from the batched Transformers run for the third prompt.
hf_continuation = tk.decode(a)
print(hf_continuation)

 Paris. It is located in the north of the country. The city is situated on the banks of the Seine River. Paris is the largest city in France. It is also the largest city in the European Union. The city has a population of over 2.2 million people. Paris is a major tourist destination. It is home to many famous landmarks, including the Eiffel Tower, the Louvre Museum, and the Notre Dame Cathedral. The city is also a major center of culture and the arts. Paris is home to many world-renowned museums, galleries, and theaters. The city is also a major center of fashion and design. Paris is a major center of business and finance. The city is home to many multinational corporations and financial institutions. Paris is a major transportation hub. The city is served by two international airports and a major railway station. Paris is a major center of education. The city is home to many universities and colleges. Paris is a major center of research and development. The city is home to many resear

In [101]:
# Continuation from vLLM for the same prompt, for side-by-side comparison.
vllm_continuation = tk.decode(b)
print(vllm_continuation)

 Paris. It is located in the north of the country. The city is situated on the banks of the Seine River. Paris is the largest city in France. It is also the largest city in the European Union. The city has a population of over 2.2 million people. Paris is a major tourist destination. It is home to many famous landmarks, including the Eiffel Tower, the Louvre Museum, and the Notre Dame Cathedral. The city is also a major center of culture and the arts. Paris is home to many famous artists, writers, and musicians. The city is also a major center of business and finance. Paris is a major transportation hub. It is served by two international airports, Charles de Gaulle and Orly. The city is also served by a major railway station, Gare de Lyon. Paris is a major center of education. It is home to many universities and colleges. The city is also home to many research institutes and laboratories. Paris is a major center of science and technology. The city is home to many major companies, inclu