In [None]:
!vllm serve meta-llama/Llama-3.1-8B --max-model-len 8192

In [61]:
from datasets import load_dataset
import evaluate
import requests
import json
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter Notebook
import time

# Load test data
dataset = load_dataset("gigaword", split="test[:100]")  # Limit to 100 for fast eval

# Initialize ROUGE metric
rouge = evaluate.load('rouge')

In [None]:
def summarize_with_vllm(document, max_tokens=50):
    prompt_template = (
        "You are an AI assistant specialized in summarizing news articles. "
        "Summarize the following news sentence into a concise headline.\n\n"

        "Here is an example:\n"
        "News: Japan 's nec corp. and UNK computer corp. of the united states said wednesday they had agreed to join forces in supercomputer sales.\n"
        "Headline: Nec UNK in computer sales tie-up\n\n"

        "Now summarize the following news:\n\n"

        "News: {document}\n\n"
        "Headline:"
    )
    
    prompt = prompt_template.format(document=document)

    payload = {
        "model": "meta-llama/Llama-3.1-8B",
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.3,
        "stream": False
    }

    response = requests.post("http://localhost:8000/v1/completions", json=payload)

    if response.status_code == 200:
        result = response.json()
        summary = result['choices'][0]['text'].strip()
        return summary
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None


In [None]:
# Generate summaries and evaluate
references = []
predictions = []

start = time.time()

# tqdm around dataset loop with a description and progress bar
for item in tqdm(dataset, desc="Summarizing", unit="example"):

    doc = item['document']
    ref_summary = item['summary']

    pred_summary = summarize_with_vllm(doc)

    if pred_summary:
        references.append(ref_summary)
        predictions.append(pred_summary)
    

end = time.time()

# Evaluate with ROUGE
results = rouge.compute(predictions=predictions, references=references)

print("vLLM (Llama-3.1-8B) Summarization Results:")

print(f"\nNumber of examples: {len(references)}")
print(f"\nElapsed time: {end - start:.2f} s")

print("\nROUGE Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

Summarizing:   0%|          | 0/100 [00:00<?, ?example/s]

vLLM (Llama-3.1-8B) Summarization Results:

Number of examples: 98

Elapsed time: 271.17 s

ROUGE Results:
rouge1: 0.1682
rouge2: 0.0582
rougeL: 0.1498
rougeLsum: 0.1538


In [None]:
model_path="/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct"

In [None]:
from benchmark.benchmark import ModelBenchmark
import os

model_name = "llama-3.1-8B-Instruct"

model_path = f"/home/ubuntu/fast_llm_inference/{model_name}"

benchmark = ModelBenchmark(
    backend="vllm",
    task="summarization",
    model_path=model_path,
    llama_gpu_layers=-1,
    max_tokens=70,
    model_size= os.path.getsize(model_path) / 1e6, # in MB
)

results = benchmark.benchmark(["""You are a headline generation assistant. Given a news article, produce a concise and informative headline.

Here are some examples:
News: Scientists have discovered a new exoplanet that appears to have water on its surface, raising hopes it may be habitable.
Headline: New exoplanet may support life

News: The stock market experienced a significant downturn today, with major indices falling sharply amid economic uncertainty.
Headline: Stock market plunges amid economic fears

News: japan 's nec corp. and UNK computer corp. of the united states said wednesday they had agreed to join forces in supercomputer sales .
Headline:"""], ["nec corp. and UNK tie-up in supercomputer sales"])

INFO 04-15 09:24:14 [config.py:689] This model supports multiple tasks: {'generate', 'score', 'reward', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 04-15 09:24:14 [config.py:1901] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-15 09:24:17 [core.py:61] Initializing a V1 LLM engine (v0.8.4) with config: model='/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct', speculative_config=None, tokenizer='/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=Observa

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 04-15 09:24:25 [loader.py:458] Loading weights took 5.41 seconds
INFO 04-15 09:24:25 [gpu_model_runner.py:1291] Model loading took 14.9596 GiB and 5.835499 seconds
INFO 04-15 09:24:35 [backends.py:416] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/02b5663e4f/rank_0_0 for vLLM's torch.compile
INFO 04-15 09:24:35 [backends.py:426] Dynamo bytecode transform time: 9.47 s


[rank0]:W0415 09:24:36.977000 40091 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode


INFO 04-15 09:24:40 [backends.py:132] Cache the graph of shape None for later use
INFO 04-15 09:25:15 [backends.py:144] Compiling a graph for general shape takes 38.99 s
INFO 04-15 09:25:30 [monitor.py:33] torch.compile takes 48.46 s in total
INFO 04-15 09:25:32 [kv_cache_utils.py:634] GPU KV cache size: 27,856 tokens
INFO 04-15 09:25:32 [kv_cache_utils.py:637] Maximum concurrency for 8,192 tokens per request: 3.40x
INFO 04-15 09:26:50 [gpu_model_runner.py:1626] Graph capturing finished in 78 secs, took 0.51 GiB
INFO 04-15 09:26:50 [core.py:163] init engine (profile, create kv cache, warmup model) took 144.93 seconds
INFO 04-15 09:26:50 [core_client.py:435] Core engine process 0 ready.


Using the latest cached version of the module from /home/ubuntu/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--rouge/b01e0accf3bd6dd24839b769a5fda24e14995071570870922c71970b3a6ed886 (last modified on Wed Mar 12 15:05:43 2025) since it couldn't be found locally at evaluate-metric--rouge, or remotely on the Hugging Face Hub.
