<a href="https://colab.research.google.com/github/rahiakela/genai-research-and-practice/blob/main/hands-on-llm-serving-and-optimization/03_llm__serving_with_vLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import gc
import time

# Unload models and clean up gpu memory cache
def free_gpu(model):
  """Drop a model reference and release cached GPU memory.

  Args:
    model: The model object to release, or None to only run the cleanup
      steps (garbage collection and CUDA cache clearing).

  Returns:
    None.

  Note:
    `del model` removes only this function's local reference. The object is
    reclaimed only once the caller has dropped its own references as well
    (for example `llm = None` or `del llm` in the calling cell).
  """
  # Compare against None explicitly: evaluating the truthiness of arbitrary
  # objects can raise (e.g. multi-element tensors) or be misleadingly falsy.
  if model is not None:
    del model

  # Collect garbage first so objects kept alive only by reference cycles are
  # destroyed before the CUDA cache is cleared.
  gc.collect()

  # Release cached GPU memory that is no longer in use and reset the
  # peak-memory counters.
  if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

# Run a cleanup pass up front. Passing None skips the model-deletion branch,
# so this only clears the CUDA cache (when CUDA is available) and runs the
# garbage collector.
free_gpu(None)


In [None]:
!pip install vllm
!pip install transformers

Run Qwen model with vLLM and track the inference time.

In [None]:
import time
from vllm import LLM, SamplingParams

model_name = "Qwen/Qwen2.5-0.5B"

# Load model with vLLM.
llm = LLM(model=model_name, dtype="float16")

# Define the prompt.
prompt = """You are an expert AI historian writing a detailed chapter for a book titled "The Evolution of Human-AI Collaboration."

Begin by summarizing the early stages of artificial intelligence in the 1950s, touching on symbolic logic and rule-based systems. Then transition into the rise of machine learning, particularly deep learning in the 2010s.

Afterward, describe how large language models like GPT transformed human-computer interaction, enabling applications in education, creative writing, customer support, and software development.

Finally, reflect on the societal and ethical implications of AI, such as misinformation, bias, and the alignment problem.

Write in a formal tone, with rich detail and examples in each era."""

# Create sampling parameters.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=128)

# Time only the generation call (model loading above is excluded).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; only the difference end_time - start_time is meaningful.
start_time = time.perf_counter()
outputs = llm.generate([prompt], sampling_params)
end_time = time.perf_counter()

# Print the results.
for output in outputs:
  # Each request output carries a list of completions; print the text of the
  # first completion rather than the repr of the whole output object.
  print(f"Generated text: {output.outputs[0].text}")
print(f"Time taken: {end_time - start_time:.2f} seconds")

# Drop the notebook-level reference before cleaning up: free_gpu() can only
# delete its own local reference, so the engine would otherwise stay alive.
# Rebinding to None (rather than `del llm`) keeps the name defined for later
# cells that still pass `llm` to free_gpu().
# NOTE(review): best effort - how much GPU memory vLLM actually returns after
# the engine is dropped depends on the vLLM version; confirm with nvidia-smi.
llm = None
free_gpu(None)


In [None]:
# Extra cleanup pass on `llm` before the Hugging Face model is loaded in a
# later cell.
free_gpu(llm)

In [None]:
Run the Qwen model with the standard (non-optimized) Hugging Face `transformers` library and track the inference time.

In [None]:
# --- Basic Model Serving (transformers) ---
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer. Loading is deliberately NOT timed, so that the
# measurement is comparable to the vLLM one, which times generation only.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B",
    device_map="auto",
    torch_dtype=torch.float16,  # same precision as the vLLM run (dtype="float16")
    trust_remote_code=True,
)

# Create the pipeline.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Time only the generation call.
# - max_new_tokens limits the number of *generated* tokens, like vLLM's
#   max_tokens; max_length would also count the (long) prompt tokens.
# - do_sample=True is needed for temperature / top_p to take effect.
start_time_basic = time.perf_counter()
outputs_basic = generator(
    prompt,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.8,
    top_p=0.95,
)
end_time_basic = time.perf_counter()

print("\n---- Basic Model Serving Results ----")
for output in outputs_basic:
    print(f"Generated text: {output['generated_text']}")
print(f"Time taken: {end_time_basic - start_time_basic:.2f} seconds")


# Positive values mean the transformers run was slower than the vLLM run.
print(f"\nLatency difference: {(end_time_basic - start_time_basic) - (end_time - start_time):.2f} seconds")

# free_gpu() can only delete its own local reference, so drop the
# notebook-level references to the pipeline and the model first.
del generator, model
free_gpu(None)


In [None]:
# Elapsed time of each run, taken from the timestamps recorded in the
# benchmark cells; a positive difference means the transformers run took
# longer than the vLLM run.
hf_elapsed = end_time_basic - start_time_basic
vllm_elapsed = end_time - start_time
print(f"\nLatency difference: {hf_elapsed - vllm_elapsed:.2f} seconds")
