### LLM Inference on CPU

In [1]:
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer # Import TextStreamer

# --- CONFIG ---
# Hugging Face model repository to load, and the question put to the model.
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
QUERY = "Explain MLOps briefly and give 3 real-world examples."
# Target accelerator for this run: "cpu", "cuda" or "tpu".
device_choice = "cpu"

# --- DEVICE SETUP ---
# Reject unknown values first so the branches below only see valid choices.
if device_choice not in ("cpu", "cuda", "tpu"):
    raise ValueError("device_choice must be one of 'cpu', 'cuda', or 'tpu'.")

if device_choice == "cpu":
    device = torch.device("cpu")
elif device_choice == "cuda":
    # Fail early with a readable message instead of erroring later during load.
    if not torch.cuda.is_available():
        raise EnvironmentError("CUDA not available — please switch to CPU or TPU.")
    device = torch.device("cuda")
else:
    # Only "tpu" can reach this point; torch_xla is imported lazily because
    # it is needed for this choice alone.
    try:
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()
        print("Using TPU device.")
    except Exception as err:
        raise EnvironmentError("TPU runtime not found. Did you enable TPU in Colab?") from err

# --- LOAD TOKENIZER SAFELY (no chat-template fetch) ---
# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to run while loading — confirm MODEL_ID points at a trusted repo.
tokenizer_options = dict(
    use_fast=True,
    trust_remote_code=True,
    local_files_only=False,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **tokenizer_options)

# Prevent future chat-template lookup attempts. The hook is only invoked when
# the tokenizer object actually exposes it; otherwise this is a no-op.
# NOTE(review): `_set_chat_template` is a private name — verify it exists in
# the installed transformers version.
if hasattr(tokenizer, "_set_chat_template"):
    tokenizer._set_chat_template(None, "remove")

# --- LOAD MODEL ON SELECTED DEVICE ---
print(f"Loading model on {device_choice.upper()}...")

# float16 on CUDA, float32 on CPU and TPU. Every load below uses this value so
# the precision is decided in exactly one place.
dtype = torch.float16 if device_choice == "cuda" else torch.float32

model = None
if device_choice == "cuda":
    # Best effort: try a 4-bit NF4 quantized load first. Any failure (missing
    # bitsandbytes, unsupported GPU, ...) is reported with its real cause and
    # then the plain load below takes over.
    try:
        from transformers import BitsAndBytesConfig
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=dtype,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=bnb_config,
            device_map={"": "cuda"},
            torch_dtype=dtype,
            trust_remote_code=True,
        )
        print("Model loaded in 4-bit quantized GPU mode.")
    except Exception as exc:
        print(
            f"4-bit quantized load failed ({type(exc).__name__}: {exc}) — "
            "loading unquantized float16 model instead."
        )

if model is None:
    # Shared path: CPU, TPU, and the CUDA fallback all load the unquantized
    # weights in `dtype` and move them to the selected device.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        trust_remote_code=True,
    ).to(device)

# Inference only: switch off training-time behaviour such as dropout.
model.eval()

# --- PROMPT PREPARATION ---
system_prompt = "You are a concise and knowledgeable AI assistant."
# Hand-built chat layout: a system turn, a user turn carrying QUERY, and an
# open assistant turn that the model is expected to complete.
prompt = "".join(
    (
        f"<|system|>\n{system_prompt}\n</s>\n",
        f"<|user|>\n{QUERY}\n</s>\n",
        "<|assistant|>\n",
    )
)
# Tokenize into PyTorch tensors, then move them to the selected device.
encoded_prompt = tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt.to(device)

# --- GENERATION ---
# Sampling settings: at most 200 new tokens, low temperature, nucleus sampling.
gen_kwargs = {
    "max_new_tokens": 200,
    "temperature": 0.2,
    "top_p": 0.95,
    "do_sample": True,
}

print("\n--- Generating response ---")
start = time.perf_counter()

# Streamer used on CPU/CUDA to print text while it is being generated; the
# prompt and special tokens are left out of what it prints.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Placeholders; both are filled in below and read again by the stats section.
output_ids = None
generated_text = ""

prompt_length = inputs["input_ids"].shape[1]
running_on_tpu = device_choice == "tpu"

with torch.no_grad():
    if running_on_tpu:
        # No streaming on TPU: generate the whole sequence, print afterwards.
        output_ids = model.generate(**inputs, **gen_kwargs)
        import torch_xla.core.xla_model as xm
        xm.mark_step()
    else:
        # The header goes out first because the streamer prints during generate().
        print("\n--- Model Output ---\n")
        output_ids = model.generate(**inputs, streamer=streamer, **gen_kwargs)

    # Decode only the tokens that follow the prompt.
    generated_text = tokenizer.decode(
        output_ids[0][prompt_length:],
        skip_special_tokens=True,
    )

    if running_on_tpu:
        print("\n--- Model Output ---\n")
        print(generated_text.strip())


end = time.perf_counter()

# --- TIMING ---
# Wall-clock span between the `start` and `end` timestamps taken around generation.
elapsed_s = end - start
prompt_tokens = inputs["input_ids"].shape[1]
# output_ids holds the prompt followed by the completion, so the number of new
# tokens is the difference in sequence length.
generated_tokens = output_ids.shape[1] - prompt_tokens
# Guard against a zero-length interval rather than dividing by zero.
tps = generated_tokens / elapsed_s if elapsed_s > 0 else float("nan")

print("\n--- Stats ---")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Generated tokens: {generated_tokens}")
print(f"Total time: {elapsed_s:.3f} s")
print(f"Throughput: {tps:.2f} tokens/s on {device_choice.upper()}")

Loading model on CPU...

--- Generating response ---

--- Model Output ---

MLOps is a process of automating the deployment, monitoring, and maintenance of machine learning (ML) models. It involves the following steps:

1. Model Development: The first step in MLOps is model development. This involves creating the training data, selecting the model architecture, and optimizing the hyperparameters.

2. Model Deployment: Once the model is developed, it needs to be deployed to production. This involves setting up a production environment, configuring the deployment pipeline, and monitoring the model's performance.

3. Monitoring and Maintenance: The model needs to be monitored and maintained throughout its lifecycle. This involves monitoring the model's performance, detecting errors, and fixing them.

Here are three real-world examples of MLOps:

1. Netflix: Netflix is a streaming service that uses machine learning to recommend movies and TV shows to its users. Netfli

--- Stats ---
Prompt