In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time

# Device that tokenized inputs are moved to: the GPU when CUDA is available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [22]:
# Notebook setup: quiet noisy library output, then log in to the Hugging Face Hub
# with a token taken from the environment (do NOT hardcode your token here).
import os
import logging
import warnings
from transformers import logging as hf_logging

# Only errors get through from transformers and from the "trl" logger.
hf_logging.set_verbosity_error()
logging.getLogger("trl").setLevel(logging.ERROR)

# UserWarnings matched by these message patterns are ignored.
for _noisy_pattern in (
    r".*loss_type=None.*ForCausalLMLoss.*",
    r".*cuDNN SDPA backward got grad_output\.strides\(\) != output\.strides\(\).*",
):
    warnings.filterwarnings("ignore", message=_noisy_pattern, category=UserWarning)
del _noisy_pattern

# NOTE(review): presumably intended to turn off tqdm's notebook widgets — confirm
# that something actually reads this variable.
os.environ["TQDM_NOTEBOOK"] = "0"

from huggingface_hub import login
from dotenv import load_dotenv

# Read a local .env file (if present), then insist on the token being set.
load_dotenv()
hf_key = os.environ.get("HUGGINGFACE_API_KEY")
if not hf_key:
    raise EnvironmentError("HUGGINGFACE_API_KEY not found. Copy .env.template to .env and add your token. See Instruction.md")
login(hf_key)


In [23]:
# Load the Llama 3.1 8B Instruct tokenizer and weights in float16, leaving
# device placement to `device_map="auto"`.
LLAMA_CHECKPOINT = "meta-llama/Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(LLAMA_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(
    LLAMA_CHECKPOINT,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:26<00:00,  6.55s/it]


In [24]:
# Example prompt for the one-off, single-model generation check.
prompt = "Create a podcast intro about AI ethics with a guest scientist."

In [25]:
# Inference: time a single generation and show the decoded text.
# Inputs are moved to the model's own device (chosen by device_map="auto")
# rather than to a separately computed global.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# CUDA work is queued asynchronously, so wait for the GPU on both sides of the
# timed region; otherwise the clock may not bracket the real work.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start = time.perf_counter()
output = model.generate(**inputs, max_new_tokens=150)
if torch.cuda.is_available():
    torch.cuda.synchronize()
latency = time.perf_counter() - start

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Latency: {latency:.2f}s")
print("Generated Podcast Clip:\n")
print(generated_text)

Latency: 9.05s
Generated Podcast Clip:

Create a podcast intro about AI ethics with a guest scientist. Dr. Rachel Kim is a leading expert in AI ethics and will discuss the importance of responsible AI development and deployment. She will share her insights on how to balance the benefits of AI with its potential risks and challenges.

Here's a sample podcast intro:

**[Upbeat background music starts playing]**

Host: Welcome to "The Future of Tech", a podcast where we explore the latest advancements in technology and their impact on society. I'm your host, [Name], and today we're talking about AI ethics with a very special guest, Dr. Rachel Kim. Dr. Kim is a renowned expert in AI ethics and has worked with top tech companies to develop responsible AI practices. Welcome to the show, Dr. Kim!

**[Music transitions to


In [4]:
# Checkpoints to benchmark, keyed by a short label.
# NOTE(review): the Llama checkpoint is also loaded as `model` in an earlier
# cell; on a full top-to-bottom run that looks like two copies in memory — confirm.
models = dict(
    llama_8b="meta-llama/Llama-3.1-8B-Instruct",
    gemma_2b="google/gemma-2-2b-it",
)

# label -> tokenizer and label -> model, filled in the loop below.
tokenizers, llms = {}, {}

for name in models:
    path = models[name]
    print(f"Loading {name}...")
    tokenizers[name] = AutoTokenizer.from_pretrained(path)
    llms[name] = AutoModelForCausalLM.from_pretrained(
        path, torch_dtype=torch.float16, device_map="auto"
    )

Loading llama_8b...



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s][A
Loading checkpoint shards:  25%|██▌       | 1/4 [00:05<00:17,  5.71s/it][A
Loading checkpoint shards:  50%|█████     | 2/4 [00:11<00:11,  5.69s/it][A
Loading checkpoint shards:  75%|███████▌  | 3/4 [00:17<00:05,  5.97s/it][A
Loading checkpoint shards: 100%|██████████| 4/4 [00:20<00:00,  5.04s/it][A


Loading gemma_2b...



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A
Loading checkpoint shards:  50%|█████     | 1/2 [00:06<00:06,  6.76s/it][A
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.69s/it][A


In [5]:
def measure_llm_latency(model, tokenizer, prompt, max_tokens=200, warmup=True):
    """Time one `model.generate` call for `prompt` and return the decoded text.

    Args:
        model: object exposing `generate(**inputs, max_new_tokens=...)`. Its
            `device` attribute, when present, decides where inputs are placed.
        tokenizer: callable returning an encoding that supports `.to(device)`,
            and offering `decode(ids, skip_special_tokens=True)`.
        prompt: text to generate from.
        max_tokens: cap on newly generated tokens for the timed call.
        warmup: when True (the default, matching previous behaviour), run a
            5-token generation first so it is not part of the measurement.

    Returns:
        A `(latency, generated_text)` tuple: elapsed seconds covering
        tokenization plus generation, and the decoded `output[0]`.
    """
    # Place inputs where the model lives; fall back to the notebook-level
    # DEVICE only when the model does not report a device.
    device = getattr(model, "device", None)
    if device is None:
        device = DEVICE

    def _drain_gpu():
        # CUDA kernels run asynchronously; waiting here makes the wall-clock
        # interval bracket the work that was actually issued.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    if warmup:
        # Warm-up
        _ = model.generate(**tokenizer(prompt, return_tensors="pt").to(device), max_new_tokens=5)

    _drain_gpu()
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=max_tokens)
    _drain_gpu()
    latency = time.perf_counter() - start

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return latency, generated_text


In [6]:
# The three distinct prompts used for the latency benchmark.
_BASE_PROMPTS = (
    # Prompt 1
    "Create a podcast intro about AI ethics with a guest scientist.",
    # Prompt 2
    "Generate a fantasy podcast clip where Gandalf gives life advice.",
    # Prompt 3
    "Podcast clip about football news with enthusiastic commentary.",
)

# For now prompts 1, 2, 3 are repeated (four passes in total) to test the latency.
prompts = list(_BASE_PROMPTS) * 4

In [7]:
# Per-prompt latencies for each model, in the same order as `prompts`.
llama_latencies, gemma_latencies = [], []

# For every prompt, time Llama first and Gemma second.
for prompt in prompts:
    for model_key, bucket in (
        ("llama_8b", llama_latencies),
        ("gemma_2b", gemma_latencies),
    ):
        elapsed, _ = measure_llm_latency(llms[model_key], tokenizers[model_key], prompt)
        bucket.append(elapsed)


In [8]:
import pandas as pd

# One row per benchmarked prompt, one latency column (seconds) per model.
latency_columns = {
    "prompt": prompts,
    "lat_llama_8b": llama_latencies,
    "lat_gemma_2b": gemma_latencies,
}
df = pd.DataFrame(data=latency_columns)

df

Unnamed: 0,prompt,lat_llama_8b,lat_gemma_2b
0,Create a podcast intro about AI ethics with a ...,5.968327,21.824368
1,Generate a fantasy podcast clip where Gandalf ...,5.968938,2.080479
2,Podcast clip about football news with enthusia...,5.953359,2.083608
3,Create a podcast intro about AI ethics with a ...,5.948627,2.089503
4,Generate a fantasy podcast clip where Gandalf ...,5.95394,2.097501
5,Podcast clip about football news with enthusia...,5.948224,2.10078
6,Create a podcast intro about AI ethics with a ...,5.945046,2.103452
7,Generate a fantasy podcast clip where Gandalf ...,5.950995,2.109144
8,Podcast clip about football news with enthusia...,5.96228,2.10871
9,Create a podcast intro about AI ethics with a ...,5.950416,2.114704
