In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

In [None]:
# Notebook setup: quiet noisy library output, then log in to the Hugging Face
# Hub with a token read from the environment (do NOT hardcode your token here).
import os
import logging, warnings
from transformers import logging as hf_logging

# Silence transformers/TRL logs early
hf_logging.set_verbosity_error()
logging.getLogger("trl").setLevel(logging.ERROR)

# Hide specific noisy warnings; filters are registered in the order listed.
for _noisy_pattern in (
    r".*loss_type=None.*ForCausalLMLoss.*",
    r".*cuDNN SDPA backward got grad_output\.strides\(\) != output\.strides\(\).*",
):
    warnings.filterwarnings("ignore", message=_noisy_pattern, category=UserWarning)

os.environ["TQDM_NOTEBOOK"] = "0"

from huggingface_hub import login
from dotenv import load_dotenv

# Load .env file (if present), then require the token before going any further.
load_dotenv()
hf_key = os.environ.get("HUGGINGFACE_API_KEY")
if not hf_key:
    raise EnvironmentError("HUGGINGFACE_API_KEY not found. Copy .env.template to .env and add your token. See Instruction.md")
login(hf_key)


In [None]:
# Checkpoint used by the single-prompt generation demo; tokenizer and weights
# are both fetched from the same Hub id.
_single_checkpoint = "meta-llama/Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(_single_checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    _single_checkpoint,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
# Prompt used by the single-generation inference cell that follows; it is
# passed to the tokenizer as plain text.
prompt = "Create a podcast intro about AI ethics with a guest scientist."

In [None]:
# Inference: tokenize the prompt, generate up to 150 new tokens, decode, print.
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(DEVICE)
output = model.generate(max_new_tokens=150, **inputs)

# On GPU, wait for outstanding CUDA work to finish before decoding.
if DEVICE.startswith("cuda"):
    torch.cuda.synchronize()

# Only the first returned sequence is decoded; special tokens are dropped.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Podcast Clip:\n", generated_text, sep="\n")

In [None]:
# Checkpoints to benchmark, keyed by the short name used in the result columns.
models = {
    "llama_8b": "meta-llama/Llama-3.1-8B-Instruct",
    "gemma_2b": "google/gemma-2-2b-it",
}

tokenizers = {}
llms = {}

# An earlier cell may already hold one of these checkpoints as `model` /
# `tokenizer`. Reuse that instance instead of loading it again: at 2 bytes per
# parameter, a second fp16 copy of an 8B model needs roughly another 16 GB.
_loaded_model = globals().get("model")
_loaded_tokenizer = globals().get("tokenizer")
_loaded_path = getattr(getattr(_loaded_model, "config", None), "name_or_path", None)

for name, path in models.items():
    if _loaded_tokenizer is not None and _loaded_path == path:
        print(f"Reusing {name} (already loaded)...")
        tokenizers[name] = _loaded_tokenizer
        llms[name] = _loaded_model
        continue

    print(f"Loading {name}...")
    tokenizers[name] = AutoTokenizer.from_pretrained(path)
    llms[name] = AutoModelForCausalLM.from_pretrained(
        path,
        device_map="auto",
        torch_dtype=torch.float16,
    )

In [None]:
def measure_llm_latency(model, tokenizer, prompt, max_tokens=200):
    """Time one tokenization + ``generate`` call for ``prompt``.

    A short warm-up generation (5 new tokens) runs first and is not timed.
    The timed region covers tokenizing the prompt, moving the inputs to the
    model's device, and generating up to ``max_tokens`` new tokens.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``. Its
            ``device`` attribute, when present, decides where inputs are sent;
            otherwise the notebook-level ``DEVICE`` is used.
        tokenizer: Callable returning an encoding with a ``.to(device)``
            method, and exposing ``decode(ids, skip_special_tokens=...)``.
        prompt: Text to generate from.
        max_tokens: Upper bound on newly generated tokens in the timed call.

    Returns:
        ``(latency, generated_text)``: elapsed seconds as a float, and the
        decoded text of the first returned sequence.
    """
    # Send inputs where the model actually lives (it may have been placed by
    # device_map="auto"); fall back to the global default only if unknown.
    device = getattr(model, "device", None)
    if device is None:
        device = DEVICE
    on_cuda = str(device).startswith("cuda")

    def _wait_for_gpu():
        # CUDA kernels are launched asynchronously; drain the queue so the
        # timer brackets only the work we intend to measure.
        if on_cuda:
            torch.cuda.synchronize()

    # Warm-up
    _ = model.generate(**tokenizer(prompt, return_tensors="pt").to(device), max_new_tokens=5)
    _wait_for_gpu()

    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=max_tokens)
    _wait_for_gpu()
    end = time.perf_counter()

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    latency = end - start
    return latency, generated_text


In [None]:
# The three distinct benchmark prompts.
_base_prompts = [
    # Prompt 1
    "Create a podcast intro about AI ethics with a guest scientist.",
    # Prompt 2
    "Generate a fantasy podcast clip where Gandalf gives life advice.",
    # Prompt 3
    "Podcast clip about football news with enthusiastic commentary.",
]

# For now prompts 1, 2, 3 are cycled four times (12 entries) to test the latency.
prompts = _base_prompts * 4

In [None]:
llama_latencies = []
gemma_latencies = []

# Each benchmark prompt is run through both models, Llama first and then Gemma.
# The loop variable is deliberately NOT called `prompt`, so the notebook-level
# `prompt` used by the single-inference cell is left intact. The generated text
# is dropped without rebinding `_`, which IPython uses for the last cell output.
_latency_sinks = (
    ("llama_8b", llama_latencies),
    ("gemma_2b", gemma_latencies),
)

for bench_prompt in prompts:
    for model_key, latencies in _latency_sinks:
        latency, _generated = measure_llm_latency(
            llms[model_key], tokenizers[model_key], bench_prompt
        )
        latencies.append(latency)


In [None]:
import pandas as pd

# One row per benchmark prompt, with one latency column per model.
_latency_columns = {
    "prompt": prompts,
    "lat_llama_8b": llama_latencies,
    "lat_gemma_2b": gemma_latencies,
}
df = pd.DataFrame(data=_latency_columns)

# Bare last expression so the notebook renders the table.
df