In [12]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load a public, lightweight model (facebook/opt-1.3b)
model_name = "facebook/opt-1.3b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto").eval()

# Define input text
input_text = "The future of AI is"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Fix missing pad_token_id warning
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS token

# Move tensors to the appropriate device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Without KV Cache
start_time = time.time()
output_no_cache = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=100,
    use_cache=False,
    do_sample=True,
    top_k=50,
    temperature=0.7,
    repetition_penalty=1.2
)
time_without_cache = (time.time() - start_time) * 1000  # Convert to ms

# With KV Cache
start_time = time.time()
output_with_cache = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=100,
    use_cache=True,
    do_sample=True,
    top_k=50,
    temperature=0.7,
    repetition_penalty=1.2
)
time_with_cache = (time.time() - start_time) * 1000  # Convert to ms

# Print results
print(f"Time without KV Cache: {time_without_cache:.2f} ms")
print(f"Time with KV Cache: {time_with_cache:.2f} ms")
print(f"Speed Improvement: {time_without_cache / time_with_cache:.2f}x\n")

# Print generated text
print("\nGenerated Text Without Cache:", tokenizer.decode(output_no_cache[0], skip_special_tokens=True))
print("Generated Text With Cache:", tokenizer.decode(output_with_cache[0], skip_special_tokens=True))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Time without KV Cache: 2036.86 ms
Time with KV Cache: 1317.99 ms
Speed Improvement: 1.55x


Generated Text Without Cache: The future of AI is in the cloud

I know I am going to get a lot of questions from people who are new or have not heard this before. But, if you haven’t seen it yet… The technology we use for machine learning and deep reinforcement learning will be around forever. It has been around since the 1960s and when computers were first invented they did many things that humans couldn't do at all (at least not really). In fact, computers could even play
Generated Text With Cache: The future of AI is here, and it's terrifying
In 2019, Google CEO Sundar Pichai said that his company would invest $15 billion in AI over the next two years. So far this year, Google has made moves to bolster its neural networks research team, adding more engineers than ever before, according to CNBC. And now, Microsoft has announced a new program called Project Baseline that aims to create a platfo