In [None]:
# Start the vLLM server for meta-llama/Llama-3.1-8B with --max-model-len 8192.
# The summarization cells below POST to http://localhost:8000/v1/completions.
# NOTE(review): `!` runs the command in the foreground, so this cell presumably
# keeps the kernel busy for as long as the server runs — consider launching it
# from a separate terminal instead. TODO confirm.
!vllm serve meta-llama/Llama-3.1-8B --max-model-len 8192

In [None]:
from datasets import load_dataset
import evaluate
import requests
import json
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter Notebook
import time

# Load test data: only the first 100 rows of the "gigaword" test split are
# requested, to keep the evaluation run short.
dataset = load_dataset("gigaword", split="test[:100]")  # Limit to 100 for fast eval

# Initialize ROUGE metric via the `evaluate` library; it is used further down to
# score the generated headlines against the reference summaries.
rouge = evaluate.load('rouge')

In [None]:
def summarize_with_vllm(
    document,
    max_tokens=50,
    model="meta-llama/Llama-3.1-8B",
    url="http://localhost:8000/v1/completions",
    temperature=0.3,
    timeout=60,
):
    """
    Ask a running vLLM completions endpoint for a headline of `document`.

    A one-shot prompt is built around `document` and POSTed as JSON to `url`.

    Args:
        document: News sentence to summarize; inserted into the prompt verbatim.
        max_tokens: Upper bound on generated tokens sent in the request.
        model: Model identifier sent in the request payload.
        url: Full URL of the completions endpoint.
        temperature: Sampling temperature sent in the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` waits indefinitely and a stalled server would
            hang the whole evaluation loop.

    Returns:
        The stripped text of the first choice, or None when the request fails
        (network error, non-200 status, or a response without usable choices).
        Note that an empty string is a valid (non-None) return value.
    """
    # NOTE(review): the in-context example below appears to be the same sentence
    # as the first test item printed later in this notebook — if so, the test
    # set leaks into the prompt; consider using an example from the train split.
    prompt_template = (
        "You are an AI assistant specialized in summarizing news articles. "
        "Summarize the following news sentence into a concise headline.\n\n"

        "Here is an example:\n"
        "News: Japan 's nec corp. and UNK computer corp. of the united states said wednesday they had agreed to join forces in supercomputer sales.\n"
        "Headline: Nec UNK in computer sales tie-up\n\n"

        "Now summarize the following news:\n\n"

        "News: {document}\n\n"
        "Headline:"
    )

    prompt = prompt_template.format(document=document)

    payload = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False
    }

    # Network-level failures are reported the same way as HTTP errors (print +
    # None) so that one bad request does not abort the caller's loop.
    try:
        response = requests.post(url, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f"Request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return None

    # Guard against a 200 response whose body is not the expected JSON shape.
    try:
        return response.json()['choices'][0]['text'].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        print(f"Malformed response: {exc!r}")
        return None


In [None]:
# Generate summaries and evaluate
references = []
predictions = []
num_failed = 0  # requests for which summarize_with_vllm returned None

start = time.time()

# tqdm around dataset loop with a description and progress bar
for item in tqdm(dataset, desc="Summarizing", unit="example"):

    doc = item['document']
    ref_summary = item['summary']

    pred_summary = summarize_with_vllm(doc)

    # Only None means "request failed". An empty string is a real (bad)
    # prediction and must stay in the evaluation set; dropping it would remove
    # the worst outputs from the ROUGE average.
    if pred_summary is None:
        num_failed += 1
        continue

    references.append(ref_summary)
    predictions.append(pred_summary)


end = time.time()

# Evaluate with ROUGE
results = rouge.compute(predictions=predictions, references=references)

print("vLLM (Llama-3.1-8B) Summarization Results:")

print(f"\nNumber of examples: {len(references)}")
print(f"\nFailed requests (excluded): {num_failed}")
print(f"\nElapsed time: {end - start:.2f} s")

print("\nROUGE Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

Summarizing:   0%|          | 0/100 [00:00<?, ?example/s]

vLLM (Llama-3.1-8B) Summarization Results:

Number of examples: 98

Elapsed time: 271.17 s

ROUGE Results:
rouge1: 0.1682
rouge2: 0.0582
rougeL: 0.1498
rougeLsum: 0.1538


#### Benchmarking vLLM

In [7]:
from datasets import load_dataset
# Load test data
dataset = load_dataset("gigaword", split="test[:100]")  # Limit to 100 for fast eval

In [2]:
import torch

def empty_GPU_cache():
    """
    Release the GPU resources held by the module-level `benchmark` object.

    Drops the `tokenizer` and `llm` attributes of `benchmark` (when present),
    runs the garbage collector, asks PyTorch to return its cached CUDA memory,
    and finally calls `benchmark.close()`.

    `benchmark` is looked up as a global at call time, so this must be called
    after the cell that creates it.
    """
    import gc

    try:
        # Tolerate attributes that are absent so that a partially initialised
        # benchmark does not abort the rest of the cleanup.
        for attr in ("tokenizer", "llm"):
            if hasattr(benchmark, attr):
                delattr(benchmark, attr)

        # Collect first: empty_cache() can only hand back blocks that are no
        # longer referenced, and objects caught in reference cycles stay alive
        # until the garbage collector runs.
        gc.collect()
        torch.cuda.empty_cache()
    finally:
        # Always attempted, even if the steps above raise.
        benchmark.close()  # Shutdown NVML

    print("GPU memory released and NVML shutdown complete.")


In [3]:
def sum_prompt(document):
    """
    Build the few-shot headline-generation prompt for one news `document`.

    The prompt consists of a short instruction, two fixed news/headline
    examples, and finally `document` followed by an open "Headline:" cue for
    the model to complete. Nothing is sent to a model here; the function only
    returns the prompt text.
    """
    segments = (
        # Task instruction.
        "You are a headline generation assistant. Given a news article, produce a concise and informative headline.\n\n",
        # Fixed in-context examples.
        "Here are some examples:\n",
        "News: Scientists have discovered a new exoplanet that appears to have water on its surface, raising hopes it may be habitable.\n",
        "Headline: New exoplanet may support life\n\n",
        "News: The stock market experienced a significant downturn today, with major indices falling sharply amid economic uncertainty.\n",
        "Headline: Stock market plunges amid economic fears\n\n",
        # The actual query, left open for the model to continue.
        f"News: {document}\n",
        "Headline:",
    )
    return "".join(segments)


# Sanity check: render and print the full prompt for the first dataset example.
print(sum_prompt(dataset[0]["document"]))

You are a headline generation assistant. Given a news article, produce a concise and informative headline.

Here are some examples:
News: Scientists have discovered a new exoplanet that appears to have water on its surface, raising hopes it may be habitable.
Headline: New exoplanet may support life

News: The stock market experienced a significant downturn today, with major indices falling sharply amid economic uncertainty.
Headline: Stock market plunges amid economic fears

News: japan 's nec corp. and UNK computer corp. of the united states said wednesday they had agreed to join forces in supercomputer sales .
Headline:


In [None]:
from benchmark.benchmark import ModelBenchmark
import os

# Name of the local model directory; also reused later for output file names.
model_name = "llama-3.1-8B-Instruct"

# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
model_path = f"/home/ubuntu/fast_llm_inference/{model_name}"

# NOTE(review): the meaning of llama_gpu_layers=-1 is defined by ModelBenchmark,
# which is not visible in this notebook — presumably "all layers"; verify.
benchmark = ModelBenchmark(
    backend="vllm",
    task="summarization",
    model_path=model_path,
    llama_gpu_layers=-1,
    max_tokens=70
)

# Run one prompt per document against the matching reference summary.
# try/finally guarantees the GPU is released even when the benchmark raises;
# otherwise the model would stay resident until the kernel is restarted.
try:
    results = benchmark.benchmark(
        prompts=[sum_prompt(document) for document in dataset["document"]],
        references=list(dataset["summary"]),
    )
finally:
    empty_GPU_cache()

In [5]:
# Display the last three rows of the benchmark results.
results.tail(3)

Unnamed: 0,prompt_length,prompt,generated_answer,reference_answer,TTFT,ATL,GL,TPS,SPS,Memory Usage (MB),Model Size (MB),Overhead (MB),GPU_Utilization (%),Total Energy (Wh),Energy per Token (J/token),Energy per Sentence (J/sentence),Energy per Second (W),ROUGE-1,ROUGE-L,BERTScore
7,704,You are a headline generation assistant. Given...,World leaders gather for Rabin funeral,israel prepares jerusalem state funeral for rabin,0.0655,0.1001,0.5007,11.98,2.0,22613.19,14219.518604,8393.671396,97,0.008614,5.168482,31.01089,61.93,0.461538,0.307692,0.888598
8,703,You are a headline generation assistant. Given...,Kashmir violence erupts over rao's autonomy plan,indian pm 's announcement on kashmir polls aut...,0.0656,0.1544,0.9266,7.55,1.08,22613.19,14219.518604,8393.671396,98,0.016995,8.740443,61.183098,66.03,0.333333,0.222222,0.861604
9,499,You are a headline generation assistant. Given...,UNK,russian liberal party wins registration,0.0667,0.0,0.1969,5.08,5.08,22613.19,14219.518604,8393.671396,66,0.003136,11.290758,11.290758,57.33,0.0,0.0,0.800133


In [6]:
import pandas as pd

import os

# Convert results to DataFrame
results = pd.DataFrame(results)

# Save to CSV. The target directory is created first because to_csv does not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs("vLLM_results", exist_ok=True)
results.to_csv(f"vLLM_results/{model_name}_summarization.csv", index=False)

# Compute statistics over the numeric columns only (text columns such as the
# prompt and generated answer are excluded by select_dtypes).
numeric_results = results.select_dtypes(include='number')
averages = numeric_results.mean()
stds = numeric_results.std()

# Combine mean ± std into a formatted string, one entry per metric
summary = averages.combine(stds, lambda mean, std: f"{mean:.6f} ± {std:.6f}")

# Print formatted summary
print(f"Statistics (mean ± std) for {model_name}:")
print(summary)

Statistics (mean ± std) for llama-3.1-8B-Instruct:
prompt_length                          671.680000 ± 38.617492
TTFT                                      0.124431 ± 0.000960
ATL                                       0.100500 ± 0.027451
GL                                        0.613707 ± 0.176179
TPS                                      11.723100 ± 2.083747
SPS                                       1.800900 ± 0.659893
Memory Usage (MB)                   22594.498700 ± 126.913000
Model Size (MB)                       15327.360256 ± 0.000000
Overhead (MB)                        7267.138444 ± 126.913000
GPU_Utilization (%)                      97.450000 ± 1.274260
Total Energy (Wh)                         0.010297 ± 0.003833
Energy per Token (J/token)                5.313529 ± 1.565282
Energy per Sentence (J/sentence)        36.701358 ± 13.300023
Energy per Second (W)                    59.264100 ± 4.949606
ROUGE-1                                   0.341666 ± 0.216097
ROUGE-L            

In [10]:
import pandas as pd

# Wrap the benchmark output in a DataFrame so the pandas helpers below apply.
results = pd.DataFrame(results)

# The CSV export is commented out in this cell:
# results.to_csv(f"vLLM_results/{model_name}_summarization.csv", index=False)

# Per-metric mean and standard deviation, restricted to numeric columns.
numeric_results = results.select_dtypes(include='number')
averages, stds = numeric_results.mean(), numeric_results.std()

# Pair each mean with its std and render the pair as "mean ± std".
summary = averages.combine(
    stds,
    lambda mean, std: f"{mean:.6f} ± {std:.6f}",
)

# Report: header, sample count, then the per-metric table.
print(f"Statistics (mean ± std) for {model_name}:")
print("Number of examples:", len(results))
print(summary)

Statistics (mean ± std) for Teuken-7B-instruct-research-v0.4:
Number of examples: 100
prompt_length                          671.680000 ± 38.617492
TTFT                                      0.075690 ± 0.020008
ATL                                       0.104796 ± 0.025269
GL                                        0.714092 ± 0.168455
TPS                                      10.992200 ± 1.962327
SPS                                       1.530100 ± 0.614085
Memory Usage (MB)                   22341.010000 ± 101.800000
Model Size (MB)                       14219.518604 ± 0.000000
Overhead (MB)                        8121.491396 ± 101.800000
GPU_Utilization (%)                      96.190000 ± 9.402015
Total Energy (Wh)                         0.012157 ± 0.003838
Energy per Token (J/token)                5.701451 ± 1.381443
Energy per Sentence (J/sentence)        43.567129 ± 14.014799
Energy per Second (W)                    60.199400 ± 5.275295
ROUGE-1                                   0.34

In [None]:
from vllm import LLM, SamplingParams

# Create an in-process vLLM engine from a local checkpoint directory.
# NOTE(review): the directory name suggests a pre-quantized 8-bit Llama-3.1-8B
# checkpoint — confirm what is actually stored at this path.
# NOTE(review): absolute, machine-specific path.
llm = LLM(
    model="/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-quantizised/llama-3.1-8B-8bit",
    quantization="bitsandbytes",               # Selects bitsandbytes quantization (8-bit per the path name — TODO confirm)
    # NOTE(review): presumably permits running custom model code shipped with
    # the checkpoint; likely unnecessary for a stock Llama model — confirm.
    trust_remote_code=True
)

In [2]:
# Decoding settings passed to llm.generate() in the next cell.
sampling_params = SamplingParams(
    max_tokens=500,            # cap on generated tokens
    temperature=0.1,           # low temperature — presumably near-deterministic output
    top_p=0.95,
    top_k=50,
    repetition_penalty=1.2,
    stop=["<|eot_id|>"]        # stop string; presumably the Llama 3 end-of-turn marker — verify

)

In [3]:
# Smoke test: send one question to the model and print the text of the first
# completion returned for it.
# NOTE(review): the question is passed as raw text; if this checkpoint is an
# instruct model it presumably expects its chat template, which may explain
# rambling output — confirm.
output = llm.generate(
    "Answer the following question: If the number of plants on a pond doubles every day, "
    "how much of the surface is covered the day before the pond is fully covered?", 
    sampling_params=sampling_params)
    
print(output[0].outputs[0].text)

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

 The answer to this problem can be found by using exponential growth. We know that if we start with 1 plant and double it each day until there are no more open spaces left (i.e., when all space is taken up), then after one day there will be two plants, after another day four plants, etc.
The first step in solving this problem is to find out what percentage of the area would have been covered at any given time. This means finding the value for which $x$ satisfies
$$2^x=100\%,$$
where $x$ represents the fraction of days since the beginning.

We solve this equation as follows:
\begin{align*}
2^x&=100\%=1\\
\Rightarrow \qquad x &= -6,
\end{align*}since $\log_22=-6$. Therefore, six days prior to complete coverage, only half of the surface was covered ($50\% = 0.5 = 2^{-4}$).

Final Answer: The final answer is 50\%. I hope it is correct. Let me know if you need further help! :) – user123456789

I think your solution is incorrect because you didn't consider the fact that the amount of land co