# CPU, GPU, and TPU Roles in MLOps

### Serdar Biçici 150210331

In [1]:
# Seed every random number generator used in this notebook with one value.
import random

import numpy as np
import torch

SEED = 42

random.seed(SEED)                  # Python standard-library RNG
np.random.seed(SEED)               # NumPy global RNG
torch.manual_seed(SEED)            # PyTorch default generator
torch.cuda.manual_seed_all(SEED)   # PyTorch CUDA generators


## CPU Experiments

### Neural Networks

In [2]:
# pip install -q "torch>=2.1" "torchvision>=0.16" "transformers>=4.41"  # transformers not required; kept for parity
# For TPU: pip install -q torch_xla[tpu]~=2.1

import time
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# --- CONFIG ---
device_choice = "cpu"   # one of: "cpu", "cuda", "tpu"
batch_size = 128
epochs = 1              # a single pass keeps the run short; raise for longer training
lr = 1e-3
seed = 42

torch.manual_seed(seed)

# --- Device selection & checks ---
# `xm` stays None unless the TPU branch imports torch_xla; the training and
# evaluation code only touches it when device_choice == "tpu".
xm = None
if device_choice == "cpu":
    device = torch.device("cpu")
elif device_choice == "cuda":
    if not torch.cuda.is_available():
        raise EnvironmentError("CUDA not available but 'cuda' was requested.")
    device = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
elif device_choice == "tpu":
    try:
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()
    except Exception as e:
        raise EnvironmentError(
            "TPU/XLA not available. Ensure TPU runtime and torch_xla are installed."
        ) from e
else:
    raise ValueError("device_choice must be 'cpu', 'cuda', or 'tpu'.")

# fp32 everywhere so results are comparable across backends
dtype = torch.float32

# --- Data ---
# Convert images to tensors, then apply the standard MNIST normalization.
tfm = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

train_ds = datasets.MNIST(root="./data", train=True, download=True, transform=tfm)
test_ds = datasets.MNIST(root="./data", train=False, download=True, transform=tfm)

# Loader settings per device: worker processes on CPU/CUDA only, pinned host
# memory on CUDA only.
num_workers = 0 if device_choice == "tpu" else 2
pin_memory = device_choice == "cuda"

loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=pin_memory,
    drop_last=False,
)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)

# --- Model ---
class SimpleCNN(nn.Module):
    """Small two-convolution CNN mapping 1x28x28 images to 10 class logits.

    Pipeline: conv(1->32) -> ReLU -> conv(32->64) -> ReLU -> 2x2 max-pool
    -> dropout(0.25) -> flatten -> linear(64*14*14->128) -> ReLU
    -> dropout(0.5) -> linear(128->10).
    """

    def __init__(self):
        super().__init__()
        # Both convolutions use padding=1 with a 3x3 kernel, so the spatial
        # size stays 28x28 until the pooling layer halves it to 14x14.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.drop1 = nn.Dropout(p=0.25)
        self.fc1 = nn.Linear(in_features=64 * 14 * 14, out_features=128)
        self.drop2 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return unnormalised class scores (logits) for the batch ``x``."""
        h = torch.relu(self.conv1(x))
        h = torch.relu(self.conv2(h))
        h = self.drop1(self.pool(h))
        h = h.flatten(start_dim=1)   # keep the batch axis, merge the rest
        h = self.drop2(torch.relu(self.fc1(h)))
        return self.fc2(h)

# Build the network, place it on the selected device/dtype, then create the
# Adam optimizer over its parameters and the cross-entropy loss.
model = SimpleCNN()
model = model.to(device=device, dtype=dtype)
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# --- Training ---
def train_one_epoch(loader):
    """Run one optimisation pass over ``loader`` and time it.

    Relies on the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``device``, ``dtype``, ``device_choice`` and, on TPU, ``xm``.

    Args:
        loader: iterable yielding ``(inputs, targets)`` batches.

    Returns:
        Tuple ``(avg_loss, acc, seconds)``: the sample-weighted mean loss,
        the fraction of correct predictions (measured while the model is in
        train mode), and the wall-clock duration of the loop.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    start = time.perf_counter()
    for xb, yb in loader:
        xb = xb.to(device=device, dtype=dtype, non_blocking=True)
        yb = yb.to(device=device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()

        if device_choice == "tpu":
            # With barrier=True the XLA optimizer step issues the step
            # barrier itself, so no separate xm.mark_step() is needed here.
            xm.optimizer_step(optimizer, barrier=True)
        else:
            optimizer.step()

        # .item() converts the scalars to Python numbers; the loss is weighted
        # by batch size so a smaller final batch is averaged correctly.
        running_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += yb.size(0)

    end = time.perf_counter()
    # max(total, 1) guards against division by zero for an empty loader.
    avg_loss = running_loss / max(total, 1)
    acc = correct / max(total, 1)
    return avg_loss, acc, end - start

# --- Evaluation (Inference on test set) ---
@torch.no_grad()
def evaluate(loader):
    """Score the notebook-level ``model`` on ``loader`` and time the pass.

    Args:
        loader: iterable yielding ``(inputs, targets)`` batches.

    Returns:
        Tuple ``(accuracy, elapsed_seconds, samples_per_second)``; the
        throughput is NaN when no time elapsed.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    on_tpu = device_choice == "tpu"

    t0 = time.perf_counter()
    for xb, yb in loader:
        xb = xb.to(device=device, dtype=dtype, non_blocking=True)
        yb = yb.to(device=device, non_blocking=True)

        hits = model(xb).argmax(dim=1) == yb
        n_correct += hits.sum().item()
        n_seen += yb.size(0)

        if on_tpu:
            # mark the end of this step so execution advances on TPU
            xm.mark_step()
    t1 = time.perf_counter()

    accuracy = n_correct / max(n_seen, 1)
    elapsed = t1 - t0
    if elapsed > 0:
        throughput = n_seen / elapsed   # samples per second
    else:
        throughput = float("nan")
    return accuracy, elapsed, throughput

# Report the configuration, train for `epochs` passes, then time inference
# over the whole test split.
print(f"Device: {device_choice.upper()}  |  dtype: {dtype}  |  epochs: {epochs}")
for ep in range(1, epochs + 1):
    tr_loss, tr_acc, tr_time = train_one_epoch(train_loader)
    print(f"[Epoch {ep}] Train loss: {tr_loss:.4f}  |  Train acc: {tr_acc*100:.2f}%  |  Time: {tr_time:.2f}s")

# Test-time inference timing
test_acc, inf_time, samples_per_s = evaluate(test_loader)
print(
    f"\nTest accuracy: {test_acc*100:.2f}%",
    f"Inference time (test set): {inf_time:.3f} s for {len(test_ds)} samples",
    f"Throughput: {samples_per_s:.2f} samples/s on {device_choice.upper()}",
    sep="\n",
)


100%|██████████| 9.91M/9.91M [00:00<00:00, 33.8MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 1.12MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 9.88MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 6.86MB/s]

Device: CPU  |  dtype: torch.float32  |  epochs: 1





[Epoch 1] Train loss: 0.2545  |  Train acc: 92.19%  |  Time: 137.93s

Test accuracy: 98.25%
Inference time (test set): 10.324 s for 10000 samples
Throughput: 968.65 samples/s on CPU


### Gradient Boosting

In [3]:
# pip install -q scikit-learn catboost

import time
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool

# --- CONFIG ---
device_choice = "cpu"   # one of: "cpu", "cuda", "tpu"
iterations = 1000
depth = 10
learning_rate = 0.1
random_state = 42
test_size = 0.2

# --- Load dataset ---
wine = load_wine()
X, y = wine.data, wine.target
# Stratified split so both parts keep the class proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
    stratify=y,
)
print(f"Dataset: wine  |  Train: {X_train.shape}  |  Test: {X_test.shape}")

# --- Select device ---
# A TPU request is downgraded to CPU for this experiment.
if device_choice == "tpu":
    print("TPU not supported for gradient boosting. Running on CPU.")
    device_choice = "cpu"

# --- Initialize CatBoost ---
# 'task_type' is set explicitly: "GPU" for a CUDA request, otherwise "CPU".
task_type = "GPU" if device_choice == "cuda" else "CPU"
clf = CatBoostClassifier(
    iterations=iterations,
    depth=depth,
    learning_rate=learning_rate,
    loss_function="MultiClass",
    task_type=task_type,
    random_seed=random_state,
    verbose=False,
)

print(f"Implementation: CatBoost ({'GPU' if device_choice == 'cuda' else 'CPU'})")

# --- Train ---
# Pool construction happens before the timer starts, so only fit() is timed.
train_pool = Pool(X_train, y_train)
start_train = time.perf_counter()
clf.fit(train_pool)
end_train = time.perf_counter()
train_time = end_train - start_train

# --- Inference ---
# Same approach: only the predict() call is inside the timed region.
test_pool = Pool(X_test, y_test)
start_inf = time.perf_counter()
y_pred = clf.predict(test_pool)
end_inf = time.perf_counter()
inf_time = end_inf - start_inf

# --- Metrics ---
acc = accuracy_score(y_test, y_pred)
if inf_time > 0:
    samples_per_s = len(y_test) / inf_time
else:
    samples_per_s = float("nan")
ms_per_sample = inf_time / len(y_test) * 1000.0

print(
    f"\nDevice choice: {device_choice.upper()}",
    f"Train time: {train_time:.3f} s on CatBoost ({device_choice.upper()})",
    f"Inference time: {inf_time:.3f} s for {len(y_test)} samples",
    f"Throughput: {samples_per_s:.2f} samples/s  |  {ms_per_sample:.3f} ms/sample",
    f"Test accuracy: {acc*100:.2f}%",
    sep="\n",
)


Dataset: wine  |  Train: (142, 13)  |  Test: (36, 13)
Implementation: CatBoost (CPU)

Device choice: CPU
Train time: 25.541 s on CatBoost (CPU)
Inference time: 0.003 s for 36 samples
Throughput: 13076.07 samples/s  |  0.076 ms/sample
Test accuracy: 97.22%


### LLM Inference

In [1]:
# pip install -q "transformers>=4.41" "torch>=2.1" sentencepiece
# Optional: pip install -q bitsandbytes accelerate  # for 4-bit GPU quantization
# For TPU: pip install -q torch_xla[tpu]~=2.1
!pip install "transformers==4.41.2"

import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- CONFIG ---
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
QUERY = "Explain MLOps briefly and give 3 real-world examples."
device_choice = "cpu"   # "cpu" | "cuda" | "tpu"

# --- DEVICE SETUP ---
xm = None  # bound to torch_xla's xla_model only when running on TPU
if device_choice not in ["cpu", "cuda", "tpu"]:
    raise ValueError("device_choice must be one of 'cpu', 'cuda', or 'tpu'.")

if device_choice == "cuda":
    if not torch.cuda.is_available():
        raise EnvironmentError("CUDA not available — please switch to CPU or TPU.")
    device = torch.device("cuda")
elif device_choice == "tpu":
    try:
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()
        print("Using TPU device.")
    except Exception as e:
        raise EnvironmentError("TPU runtime not found. Did you enable TPU in Colab?") from e
else:
    device = torch.device("cpu")

# --- LOAD TOKENIZER SAFELY (no chat-template fetch) ---
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run on load — confirm this model actually requires it.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,
    trust_remote_code=True,
    local_files_only=False,
)
# Prevent future chat-template lookup attempts. The hook is private, so it is
# only called when this tokenizer version exposes it.
if hasattr(tokenizer, "_set_chat_template"):
    tokenizer._set_chat_template(None, "remove")

# --- LOAD MODEL ON SELECTED DEVICE ---
print(f"Loading model on {device_choice.upper()}...")

# Half precision on CUDA, float32 on CPU/TPU; every load below uses this.
dtype = torch.float16 if device_choice == "cuda" else torch.float32

if device_choice == "cuda":
    try:
        from transformers import BitsAndBytesConfig
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=dtype,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=bnb_config,
            device_map={"": "cuda"},
            torch_dtype=dtype,
            trust_remote_code=True,
        )
        print("Model loaded in 4-bit quantized GPU mode.")
    except Exception as e:
        # Deliberately broad best-effort fallback, but report the real cause:
        # the failure is not necessarily a missing bitsandbytes install, and
        # the fallback is float16, not full precision.
        print(
            f"4-bit quantized load failed ({type(e).__name__}: {e}) — "
            "loading model in float16 instead."
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=dtype,
            trust_remote_code=True,
        ).to(device)
else:
    # CPU and TPU share the same float32 load path.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        trust_remote_code=True,
    ).to(device)

model.eval()

# --- PROMPT PREPARATION ---
# The chat markup is written out by hand instead of using a chat template.
system_prompt = "You are a concise and knowledgeable AI assistant."
prompt = (
    f"<|system|>\n{system_prompt}\n</s>\n"
    f"<|user|>\n{QUERY}\n</s>\n"
    f"<|assistant|>\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
prompt_len = inputs["input_ids"].shape[1]

# --- GENERATION ---
gen_kwargs = dict(max_new_tokens=200, temperature=0.2, top_p=0.95, do_sample=True)

print("\n--- Generating response ---")
start = time.perf_counter()
with torch.no_grad():
    output_ids = model.generate(**inputs, **gen_kwargs)
    # Make sure pending device work is finished before the timer stops.
    if device_choice == "tpu":
        xm.mark_step()
    elif device_choice == "cuda":
        torch.cuda.synchronize()
end = time.perf_counter()

# --- DECODE ---
# Drop the prompt tokens so only the newly generated continuation is shown.
generated_text = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
print("\n--- Model Output ---\n")
print(generated_text.strip())

# --- TIMING ---
elapsed_s = end - start
generated_tokens = output_ids.shape[1] - prompt_len
tps = generated_tokens / elapsed_s if elapsed_s > 0 else float("nan")

print("\n--- Stats ---")
print(f"Prompt tokens: {prompt_len}")
print(f"Generated tokens: {generated_tokens}")
print(f"Total time: {elapsed_s:.3f} s")
print(f"Throughput: {tps:.2f} tokens/s on {device_choice.upper()}")


Collecting transformers==4.41.2
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers==4.41.2)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.2)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_6

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Loading model on CPU...


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


--- Generating response ---

--- Model Output ---

MLOps is a process that involves automating the deployment, monitoring, and scaling of machine learning (ML) models. It is a critical component of modern data science and machine learning workflows. MLOps is a holistic approach that involves the following steps:

1. Model Development: The first step in MLOps is model development. This involves creating a model that can be trained on the data and deployed to the production environment.

2. Model Deployment: Once the model is developed, it needs to be deployed to the production environment. This involves configuring the infrastructure, setting up the environment, and deploying the model.

3. Monitoring and Scaling: MLOps involves monitoring and scaling the model to ensure that it is performing optimally. This involves monitoring the model's performance, identifying issues, and scaling the model to meet the demand.

3 Real-World Examples:

1. Net

--- Stats ---
Prompt tokens: 55
Generate

## GPU Experiments

### Neural Networks

In [1]:
# pip install -q "torch>=2.1" "torchvision>=0.16" "transformers>=4.41"  # transformers not required; kept for parity
# For TPU: pip install -q torch_xla[tpu]~=2.1

import time
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# --- CONFIG ---
device_choice = "cuda"   # one of: "cpu", "cuda", "tpu"
batch_size = 128
epochs = 1              # a single pass keeps the run short; raise for longer training
lr = 1e-3
seed = 42

torch.manual_seed(seed)

# --- Device selection & checks ---
# `xm` stays None unless the TPU branch imports torch_xla; the training and
# evaluation code only touches it when device_choice == "tpu".
xm = None
if device_choice == "cpu":
    device = torch.device("cpu")
elif device_choice == "cuda":
    if not torch.cuda.is_available():
        raise EnvironmentError("CUDA not available but 'cuda' was requested.")
    device = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
elif device_choice == "tpu":
    try:
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()
    except Exception as e:
        raise EnvironmentError(
            "TPU/XLA not available. Ensure TPU runtime and torch_xla are installed."
        ) from e
else:
    raise ValueError("device_choice must be 'cpu', 'cuda', or 'tpu'.")

# fp32 everywhere so results are comparable across backends
dtype = torch.float32

# --- Data ---
# Convert images to tensors, then apply the standard MNIST normalization.
tfm = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

train_ds = datasets.MNIST(root="./data", train=True, download=True, transform=tfm)
test_ds = datasets.MNIST(root="./data", train=False, download=True, transform=tfm)

# Loader settings per device: worker processes on CPU/CUDA only, pinned host
# memory on CUDA only.
num_workers = 0 if device_choice == "tpu" else 2
pin_memory = device_choice == "cuda"

loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=pin_memory,
    drop_last=False,
)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)

# --- Model ---
class SimpleCNN(nn.Module):
    """Small two-convolution CNN mapping 1x28x28 images to 10 class logits.

    Pipeline: conv(1->32) -> ReLU -> conv(32->64) -> ReLU -> 2x2 max-pool
    -> dropout(0.25) -> flatten -> linear(64*14*14->128) -> ReLU
    -> dropout(0.5) -> linear(128->10).
    """

    def __init__(self):
        super().__init__()
        # Both convolutions use padding=1 with a 3x3 kernel, so the spatial
        # size stays 28x28 until the pooling layer halves it to 14x14.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.drop1 = nn.Dropout(p=0.25)
        self.fc1 = nn.Linear(in_features=64 * 14 * 14, out_features=128)
        self.drop2 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return unnormalised class scores (logits) for the batch ``x``."""
        h = torch.relu(self.conv1(x))
        h = torch.relu(self.conv2(h))
        h = self.drop1(self.pool(h))
        h = h.flatten(start_dim=1)   # keep the batch axis, merge the rest
        h = self.drop2(torch.relu(self.fc1(h)))
        return self.fc2(h)

# Build the network, place it on the selected device/dtype, then create the
# Adam optimizer over its parameters and the cross-entropy loss.
model = SimpleCNN()
model = model.to(device=device, dtype=dtype)
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# --- Training ---
def train_one_epoch(loader):
    """Run one optimisation pass over ``loader`` and time it.

    Relies on the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``device``, ``dtype``, ``device_choice`` and, on TPU, ``xm``.

    Args:
        loader: iterable yielding ``(inputs, targets)`` batches.

    Returns:
        Tuple ``(avg_loss, acc, seconds)``: the sample-weighted mean loss,
        the fraction of correct predictions (measured while the model is in
        train mode), and the wall-clock duration of the loop.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    start = time.perf_counter()
    for xb, yb in loader:
        xb = xb.to(device=device, dtype=dtype, non_blocking=True)
        yb = yb.to(device=device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()

        if device_choice == "tpu":
            # With barrier=True the XLA optimizer step issues the step
            # barrier itself, so no separate xm.mark_step() is needed here.
            xm.optimizer_step(optimizer, barrier=True)
        else:
            optimizer.step()

        # .item() converts the scalars to Python numbers; the loss is weighted
        # by batch size so a smaller final batch is averaged correctly.
        running_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += yb.size(0)

    end = time.perf_counter()
    # max(total, 1) guards against division by zero for an empty loader.
    avg_loss = running_loss / max(total, 1)
    acc = correct / max(total, 1)
    return avg_loss, acc, end - start

# --- Evaluation (Inference on test set) ---
@torch.no_grad()
def evaluate(loader):
    """Score the notebook-level ``model`` on ``loader`` and time the pass.

    Args:
        loader: iterable yielding ``(inputs, targets)`` batches.

    Returns:
        Tuple ``(accuracy, elapsed_seconds, samples_per_second)``; the
        throughput is NaN when no time elapsed.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    on_tpu = device_choice == "tpu"

    t0 = time.perf_counter()
    for xb, yb in loader:
        xb = xb.to(device=device, dtype=dtype, non_blocking=True)
        yb = yb.to(device=device, non_blocking=True)

        hits = model(xb).argmax(dim=1) == yb
        n_correct += hits.sum().item()
        n_seen += yb.size(0)

        if on_tpu:
            # mark the end of this step so execution advances on TPU
            xm.mark_step()
    t1 = time.perf_counter()

    accuracy = n_correct / max(n_seen, 1)
    elapsed = t1 - t0
    if elapsed > 0:
        throughput = n_seen / elapsed   # samples per second
    else:
        throughput = float("nan")
    return accuracy, elapsed, throughput

# Report the configuration, train for `epochs` passes, then time inference
# over the whole test split.
print(f"Device: {device_choice.upper()}  |  dtype: {dtype}  |  epochs: {epochs}")
for ep in range(1, epochs + 1):
    tr_loss, tr_acc, tr_time = train_one_epoch(train_loader)
    print(f"[Epoch {ep}] Train loss: {tr_loss:.4f}  |  Train acc: {tr_acc*100:.2f}%  |  Time: {tr_time:.2f}s")

# Test-time inference timing
test_acc, inf_time, samples_per_s = evaluate(test_loader)
print(
    f"\nTest accuracy: {test_acc*100:.2f}%",
    f"Inference time (test set): {inf_time:.3f} s for {len(test_ds)} samples",
    f"Throughput: {samples_per_s:.2f} samples/s on {device_choice.upper()}",
    sep="\n",
)


100%|██████████| 9.91M/9.91M [00:01<00:00, 5.85MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 154kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.46MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 7.68MB/s]


Device: CUDA  |  dtype: torch.float32  |  epochs: 1
[Epoch 1] Train loss: 0.2370  |  Train acc: 92.73%  |  Time: 9.00s

Test accuracy: 98.35%
Inference time (test set): 1.315 s for 10000 samples
Throughput: 7605.74 samples/s on CUDA


### Gradient Boosting

In [2]:
# pip install -q scikit-learn catboost

import time
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool

# --- CONFIG ---
device_choice = "cuda"   # one of: "cpu", "cuda", "tpu"
iterations = 1000
depth = 10
learning_rate = 0.1
random_state = 42
test_size = 0.2

# --- Load dataset ---
wine = load_wine()
X, y = wine.data, wine.target
# Stratified split so both parts keep the class proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
    stratify=y,
)
print(f"Dataset: wine  |  Train: {X_train.shape}  |  Test: {X_test.shape}")

# --- Select device ---
# A TPU request is downgraded to CPU for this experiment.
if device_choice == "tpu":
    print("TPU not supported for gradient boosting. Running on CPU.")
    device_choice = "cpu"

# --- Initialize CatBoost ---
# 'task_type' is set explicitly: "GPU" for a CUDA request, otherwise "CPU".
task_type = "GPU" if device_choice == "cuda" else "CPU"
clf = CatBoostClassifier(
    iterations=iterations,
    depth=depth,
    learning_rate=learning_rate,
    loss_function="MultiClass",
    task_type=task_type,
    random_seed=random_state,
    verbose=False,
)

print(f"Implementation: CatBoost ({'GPU' if device_choice == 'cuda' else 'CPU'})")

# --- Train ---
# Pool construction happens before the timer starts, so only fit() is timed.
train_pool = Pool(X_train, y_train)
start_train = time.perf_counter()
clf.fit(train_pool)
end_train = time.perf_counter()
train_time = end_train - start_train

# --- Inference ---
# Same approach: only the predict() call is inside the timed region.
test_pool = Pool(X_test, y_test)
start_inf = time.perf_counter()
y_pred = clf.predict(test_pool)
end_inf = time.perf_counter()
inf_time = end_inf - start_inf

# --- Metrics ---
acc = accuracy_score(y_test, y_pred)
if inf_time > 0:
    samples_per_s = len(y_test) / inf_time
else:
    samples_per_s = float("nan")
ms_per_sample = inf_time / len(y_test) * 1000.0

print(
    f"\nDevice choice: {device_choice.upper()}",
    f"Train time: {train_time:.3f} s on CatBoost ({device_choice.upper()})",
    f"Inference time: {inf_time:.3f} s for {len(y_test)} samples",
    f"Throughput: {samples_per_s:.2f} samples/s  |  {ms_per_sample:.3f} ms/sample",
    f"Test accuracy: {acc*100:.2f}%",
    sep="\n",
)


Dataset: wine  |  Train: (142, 13)  |  Test: (36, 13)
Implementation: CatBoost (GPU)

Device choice: CUDA
Train time: 33.910 s on CatBoost (CUDA)
Inference time: 0.003 s for 36 samples
Throughput: 13490.20 samples/s  |  0.074 ms/sample
Test accuracy: 100.00%


### LLM Inference

In [3]:
# pip install -q "transformers>=4.41" "torch>=2.1" sentencepiece
# Optional: pip install -q bitsandbytes accelerate  # for 4-bit GPU quantization
# For TPU: pip install -q torch_xla[tpu]~=2.1
!pip install "transformers==4.41.2"

import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- CONFIG ---
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
QUERY = "Explain MLOps briefly and give 3 real-world examples."
device_choice = "cuda"   # "cpu" | "cuda" | "tpu"

# --- DEVICE SETUP ---
xm = None  # bound to torch_xla's xla_model only when running on TPU
if device_choice not in ["cpu", "cuda", "tpu"]:
    raise ValueError("device_choice must be one of 'cpu', 'cuda', or 'tpu'.")

if device_choice == "cuda":
    if not torch.cuda.is_available():
        raise EnvironmentError("CUDA not available — please switch to CPU or TPU.")
    device = torch.device("cuda")
elif device_choice == "tpu":
    try:
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()
        print("Using TPU device.")
    except Exception as e:
        raise EnvironmentError("TPU runtime not found. Did you enable TPU in Colab?") from e
else:
    device = torch.device("cpu")

# --- LOAD TOKENIZER SAFELY (no chat-template fetch) ---
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run on load — confirm this model actually requires it.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,
    trust_remote_code=True,
    local_files_only=False,
)
# Prevent future chat-template lookup attempts. The hook is private, so it is
# only called when this tokenizer version exposes it.
if hasattr(tokenizer, "_set_chat_template"):
    tokenizer._set_chat_template(None, "remove")

# --- LOAD MODEL ON SELECTED DEVICE ---
print(f"Loading model on {device_choice.upper()}...")

# Half precision on CUDA, float32 on CPU/TPU; every load below uses this.
dtype = torch.float16 if device_choice == "cuda" else torch.float32

if device_choice == "cuda":
    try:
        from transformers import BitsAndBytesConfig
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=dtype,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=bnb_config,
            device_map={"": "cuda"},
            torch_dtype=dtype,
            trust_remote_code=True,
        )
        print("Model loaded in 4-bit quantized GPU mode.")
    except Exception as e:
        # Deliberately broad best-effort fallback, but report the real cause:
        # the failure is not necessarily a missing bitsandbytes install, and
        # the fallback is float16, not full precision.
        print(
            f"4-bit quantized load failed ({type(e).__name__}: {e}) — "
            "loading model in float16 instead."
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=dtype,
            trust_remote_code=True,
        ).to(device)
else:
    # CPU and TPU share the same float32 load path.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        trust_remote_code=True,
    ).to(device)

model.eval()

# --- PROMPT PREPARATION ---
# The chat markup is written out by hand instead of using a chat template.
system_prompt = "You are a concise and knowledgeable AI assistant."
prompt = (
    f"<|system|>\n{system_prompt}\n</s>\n"
    f"<|user|>\n{QUERY}\n</s>\n"
    f"<|assistant|>\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
prompt_len = inputs["input_ids"].shape[1]

# --- GENERATION ---
gen_kwargs = dict(max_new_tokens=200, temperature=0.2, top_p=0.95, do_sample=True)

print("\n--- Generating response ---")
start = time.perf_counter()
with torch.no_grad():
    output_ids = model.generate(**inputs, **gen_kwargs)
    # Make sure pending device work is finished before the timer stops.
    if device_choice == "tpu":
        xm.mark_step()
    elif device_choice == "cuda":
        torch.cuda.synchronize()
end = time.perf_counter()

# --- DECODE ---
# Drop the prompt tokens so only the newly generated continuation is shown.
generated_text = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
print("\n--- Model Output ---\n")
print(generated_text.strip())

# --- TIMING ---
elapsed_s = end - start
generated_tokens = output_ids.shape[1] - prompt_len
tps = generated_tokens / elapsed_s if elapsed_s > 0 else float("nan")

print("\n--- Stats ---")
print(f"Prompt tokens: {prompt_len}")
print(f"Generated tokens: {generated_tokens}")
print(f"Total time: {elapsed_s:.3f} s")
print(f"Throughput: {tps:.2f} tokens/s on {device_choice.upper()}")


Collecting transformers==4.41.2
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers==4.41.2)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.2)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_6

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Loading model on CUDA...
bitsandbytes not installed — loading model in full precision.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


--- Generating response ---

--- Model Output ---

MLOps is a process that automates the deployment, monitoring, and maintenance of machine learning (ML) models. It involves the following steps:

1. Model development: The first step in MLOps is to develop the ML model. This involves collecting data, defining the model architecture, selecting the appropriate algorithms, and training the model.

2. Deployment: Once the model is developed, it needs to be deployed to production. This involves configuring the infrastructure, setting up the environment, and deploying the model.

3. Monitoring: MLOps involves monitoring the model's performance and identifying any issues. This involves collecting metrics, logging, and monitoring the model's behavior.

4. Maintenance: MLOps involves maintaining the model over time. This involves updating the model architecture, fixing bugs, and ensuring that the model is performing optimally.

Here are three real-world

--- Stats ---
Prompt tokens: 55
Generate

## TPU Experiments

### Neural Networks

In [1]:
# pip install -q "torch>=2.1" "torchvision>=0.16" "transformers>=4.41"  # transformers not required; kept for parity
# For TPU: pip install -q torch_xla[tpu]~=2.1

import time
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# --- CONFIG ---
device_choice = "tpu"   # one of: "cpu", "cuda", "tpu"
batch_size = 128
epochs = 1              # a single pass keeps the run short; raise for longer training
lr = 1e-3
seed = 42

torch.manual_seed(seed)

# --- Device selection & checks ---
# `xm` stays None unless the TPU branch imports torch_xla; the training and
# evaluation code only touches it when device_choice == "tpu".
xm = None
if device_choice == "cpu":
    device = torch.device("cpu")
elif device_choice == "cuda":
    if not torch.cuda.is_available():
        raise EnvironmentError("CUDA not available but 'cuda' was requested.")
    device = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
elif device_choice == "tpu":
    try:
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()
    except Exception as e:
        raise EnvironmentError(
            "TPU/XLA not available. Ensure TPU runtime and torch_xla are installed."
        ) from e
else:
    raise ValueError("device_choice must be 'cpu', 'cuda', or 'tpu'.")

# fp32 everywhere so results are comparable across backends
dtype = torch.float32

# --- Data ---
# Convert images to tensors, then apply the standard MNIST normalization.
tfm = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

train_ds = datasets.MNIST(root="./data", train=True, download=True, transform=tfm)
test_ds = datasets.MNIST(root="./data", train=False, download=True, transform=tfm)

# Loader settings per device: worker processes on CPU/CUDA only, pinned host
# memory on CUDA only.
num_workers = 0 if device_choice == "tpu" else 2
pin_memory = device_choice == "cuda"

loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=pin_memory,
    drop_last=False,
)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)

# --- Model ---
class SimpleCNN(nn.Module):
    """Small two-convolution classifier that emits 10 logits per image.

    Pipeline: conv(1->32) -> ReLU -> conv(32->64) -> ReLU -> 2x2 max-pool ->
    dropout(0.25) -> flatten -> linear(64*14*14 -> 128) -> ReLU ->
    dropout(0.5) -> linear(128 -> 10).

    The input size of the first linear layer (64 * 14 * 14) fixes the expected
    input to single-channel 28x28 images.
    """

    def __init__(self):
        super().__init__()
        # 3x3 kernels with padding=1 leave the 28x28 spatial size unchanged.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        # Halves both spatial dimensions: 28x28 -> 14x14.
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.drop1 = nn.Dropout(p=0.25)
        self.fc1 = nn.Linear(in_features=64 * 14 * 14, out_features=128)
        self.drop2 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return unnormalized class scores with one row of 10 values per input."""
        feats = F.relu(self.conv2(F.relu(self.conv1(x))))
        feats = self.drop1(self.pool(feats))
        # Keep the batch dimension and collapse everything else for the MLP head.
        hidden = F.relu(self.fc1(feats.flatten(start_dim=1)))
        return self.fc2(self.drop2(hidden))

# The loss owns no parameters, so it can be built independently of the model.
criterion = nn.CrossEntropyLoss()
# Place/cast the network first so the optimizer is built from the parameters
# as they exist on the target device.
model = SimpleCNN().to(device, dtype)
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

# --- Training ---
def train_one_epoch(loader):
    """Run one optimization pass over ``loader`` and time it.

    Relies on the module-level ``model``, ``optimizer``, ``criterion``,
    ``device``, ``dtype`` and ``device_choice`` (plus ``xm`` when
    ``device_choice == "tpu"``).

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        A tuple ``(avg_loss, acc, seconds)`` where ``avg_loss`` is the
        sample-weighted mean of the per-batch losses, ``acc`` is the fraction
        of correct predictions taken from the training-mode forward passes,
        and ``seconds`` is the wall-clock duration of the batch loop.
        An empty loader yields ``(0.0, 0.0, seconds)``.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    start = time.perf_counter()
    for xb, yb in loader:
        xb = xb.to(device=device, dtype=dtype, non_blocking=True)
        yb = yb.to(device=device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()

        if device_choice == "tpu":
            # XLA-aware optimizer step. barrier=True makes optimizer_step
            # issue the step marker itself, so a separate xm.mark_step()
            # afterwards would only add a redundant barrier to every batch.
            xm.optimizer_step(optimizer, barrier=True)
        else:
            optimizer.step()

        # Weight the batch-mean loss by the batch size so the final average is
        # per sample even when the last batch is smaller.
        running_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += yb.size(0)

    end = time.perf_counter()
    # max(total, 1) guards against division by zero for an empty loader.
    avg_loss = running_loss / max(total, 1)
    acc = correct / max(total, 1)
    return avg_loss, acc, end - start

# --- Evaluation (Inference on test set) ---
@torch.no_grad()
def evaluate(loader):
    """Measure accuracy and throughput of ``model`` over ``loader``.

    Relies on the module-level ``model``, ``device``, ``dtype`` and
    ``device_choice`` (plus ``xm`` when ``device_choice == "tpu"``).
    Gradient tracking is disabled for the whole call by the decorator.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        A tuple ``(acc, elapsed, samples_per_sec)``: fraction of correct
        predictions, wall-clock seconds spent in the batch loop, and samples
        processed per second (``nan`` if no time elapsed).
    """
    model.eval()
    on_tpu = device_choice == "tpu"
    n_correct = 0
    n_seen = 0

    t0 = time.perf_counter()
    for inputs, labels in loader:
        inputs = inputs.to(device=device, dtype=dtype, non_blocking=True)
        labels = labels.to(device=device, non_blocking=True)

        hits = model(inputs).argmax(dim=1) == labels
        n_correct += hits.sum().item()
        n_seen += labels.size(0)

        if on_tpu:
            # ensure execution advances on TPU
            xm.mark_step()
    elapsed = time.perf_counter() - t0

    # max(n_seen, 1) keeps an empty loader from dividing by zero.
    acc = n_correct / max(n_seen, 1)
    if elapsed > 0:
        samples_per_sec = n_seen / elapsed
    else:
        samples_per_sec = float("nan")
    return acc, elapsed, samples_per_sec

# --- Run: train for `epochs` epochs, then time one pass over the test set ---
print(f"Device: {device_choice.upper()}  |  dtype: {dtype}  |  epochs: {epochs}")
for ep in range(1, epochs + 1):  # 1-based so the printed epoch number starts at 1
    # train_one_epoch returns (mean loss, accuracy as a fraction, seconds).
    tr_loss, tr_acc, tr_time = train_one_epoch(train_loader)
    print(f"[Epoch {ep}] Train loss: {tr_loss:.4f}  |  Train acc: {tr_acc*100:.2f}%  |  Time: {tr_time:.2f}s")

# Test-time inference timing
# evaluate returns (accuracy as a fraction, elapsed seconds, samples per second).
test_acc, inf_time, samples_per_s = evaluate(test_loader)
print(f"\nTest accuracy: {test_acc*100:.2f}%")
print(f"Inference time (test set): {inf_time:.3f} s for {len(test_ds)} samples")
print(f"Throughput: {samples_per_s:.2f} samples/s on {device_choice.upper()}")


  device = xm.xla_device()
E0000 00:00:1761853576.621951      10 common_lib.cc:648] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: === 
learning/45eac/tfrc/runtime/common_lib.cc:238
100%|██████████| 9.91M/9.91M [00:00<00:00, 38.7MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 1.02MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 9.49MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 9.05MB/s]


Device: TPU  |  dtype: torch.float32  |  epochs: 1


  xm.mark_step()


[Epoch 1] Train loss: 0.2437  |  Train acc: 92.51%  |  Time: 48.88s


  xm.mark_step()



Test accuracy: 98.25%
Inference time (test set): 9.993 s for 10000 samples
Throughput: 1000.66 samples/s on TPU
