In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
%run checkpoint_script.ipynb

<torch.utils.data.dataloader.DataLoader object at 0x111a4f0e0>
Epoch 0 loss 0.6970
Epoch 1 loss 0.6870
Epoch 2 loss 0.6818
Epoch 3 loss 0.6807
Epoch 4 loss 0.6777
Epoch 5 loss 0.6770
Epoch 6 loss 0.6736
Epoch 7 loss 0.6722
Epoch 8 loss 0.6746
Epoch 9 loss 0.6739
Epoch 10 loss 0.6722
Epoch 11 loss 0.6702
Epoch 12 loss 0.6697
Epoch 13 loss 0.6714
Epoch 14 loss 0.6712
Epoch 15 loss 0.6688
Epoch 16 loss 0.6711
Epoch 17 loss 0.6681
Epoch 18 loss 0.6711
Epoch 19 loss 0.6700


In [3]:
#Define Mode
class SimpleNet(nn.Module):
    def __init__(self,in_dim=10,out_dim=2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim,64),
            nn.ReLU(),
            nn.Linear(64,out_dim)
        )
    def forward(self,x):
        return self.net(x)
        

In [4]:
def train_example(checkpoint_path="checkpoints/ckpt.pt", resume=False):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SimpleNet().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    start_epoch = 0
    best_val = float("inf")

    if resume and os.path.exists(checkpoint_path):
        ckpt = load_checkpoint(checkpoint_path, model, optimizer, scheduler, device=device)
        start_epoch = ckpt.get("epoch", 0) + 1
        best_val = ckpt.get("best_metric", best_val)
        print(f"Resumed from epoch {start_epoch}, best_val={best_val}")

    # Dummy dataset
    x = torch.randn(1000, 10)
    y = torch.randint(0, 2, (1000,))
    ds = TensorDataset(x, y)
    loader = DataLoader(ds, batch_size=32, shuffle=True)

    for epoch in range(start_epoch, 20):
        model.train()
        total_loss = 0.0
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            loss = nn.CrossEntropyLoss()(logits, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        scheduler.step()

        val_metric = total_loss / len(loader)  # placeholder for real val metric
        print(f"Epoch {epoch} loss {val_metric:.4f}")

        # Save checkpoint every epoch or when improved
        is_best = val_metric < best_val
        if is_best:
            best_val = val_metric
            torch.save(model.state_dict(), "models/best_weights.pt")

        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        save_checkpoint(
            checkpoint_path,
            model,
            optimizer,
            scheduler=scheduler,
            epoch=epoch,
            best_metric=best_val,
            extra={"notes": "example run"}
        )

In [5]:
train_example(checkpoint_path="checkpoints/ckpt.pt", resume=False)

Epoch 0 loss 0.7070


RuntimeError: Parent directory models does not exist.

In [None]:
# Save model weights only for inference
torch.save(model.state_dict(), "models/model_weights.pt")

# Load for inference
model = SimpleNet()
state = torch.load("models/model_weights.pt", map_location="cpu")
model.load_state_dict(state)
model.eval()