In [1]:
import os
import random
import numpy as np
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
# import torchvision
# import torchvision.transforms as transforms
# import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import wandb

# save weights and biases api key to .env file in project directory
assert os.getenv('WANDB_API_KEY')
assert torch.cuda.is_available()

# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)  # noqa: NPY002
torch.manual_seed(hash("by removing stochasticity") % 2**32 - 1)
torch.cuda.manual_seed_all(hash("so runs are repeatable") % 2**32 - 1)

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshane-kercheval[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
x, y = fetch_openml('mnist_784', version=1, return_X_y=True, parser='auto')
x = torch.tensor(x.values, dtype=torch.float32)
y = torch.tensor(y.astype(int).values, dtype=torch.long)

# need to make this dynamic based on Fully Connected vs Convolutional
# Reshape data to have channel dimension
# MNIST images are 28x28, so we reshape them to [batch_size, 1, 28, 28]
x = x.reshape(-1, 1, 28, 28)

# 80% train; 10% validation; 10% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

print(f"Training set  : X-{x_train.shape}, y-{y_train.shape}")
print(f"Validation set: X-{x_val.shape}, y-{y_val.shape}")
print(f"Test set      : X-{x_test.shape}, y-{y_test.shape}")

Training set  : X-torch.Size([56000, 1, 28, 28]), y-torch.Size([56000])
Validation set: X-torch.Size([7000, 1, 28, 28]), y-torch.Size([7000])
Test set      : X-torch.Size([7000, 1, 28, 28]), y-torch.Size([7000])


In [4]:
class ConvNet(nn.Module):
    """Convolutional neural network (two convolutional layers)."""

    def __init__(self, kernels: list, classes: int = 10):
        super().__init__()

        self.layer1 = nn.Sequential(
            nn.Conv2d(1, kernels[0], kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, kernels[1], kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7 * 7 * kernels[-1], classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass."""
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        return self.fc(out)

In [5]:
def make_loader(x: torch.tensor, y: torch.tensor, batch_size: int) -> DataLoader:
    """Make a DataLoader from a given dataset."""
    return DataLoader(
        dataset=TensorDataset(x, y),
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=2,
    )


def make(config: dict) -> tuple:
    """Make the model, data, and optimization objects."""
    # Make the data
    train_loader = make_loader(x_train, y_train, batch_size=config.batch_size)
    validation_loader = make_loader(x_val, y_val, batch_size=config.batch_size)
    test_loader = make_loader(x_test, y_test, batch_size=config.batch_size)

    # Make the model
    model = ConvNet(config.kernels, config.classes).to(device)

    # Make the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    return (
        model,
        train_loader,
        validation_loader,
        test_loader,
        criterion,
        optimizer,
    )


def train_log(training_loss: float, validation_loss: float, example_ct: int, epoch: int) -> None:
    """Logs loss to the console and wandb."""
    # Where the magic happens
    wandb.log(
        {
            'epoch': epoch,
            'training_loss': training_loss,
            'validation_loss': validation_loss
        },
        step=example_ct,
    )
    print(
        f"Training/Validation Loss after {str(example_ct).zfill(5)} examples: ",
        f"{training_loss:.3f} | {validation_loss:.3f}",
    )


def calculate_average_loss(
        data_loader: DataLoader,
        model: nn.Module,
        loss_function: callable) -> float:
    """Calculates the average loss over a dataset."""
    running_loss = 0
    total_samples = 0
    with torch.no_grad():
        for x, y in data_loader:
            x, y = x.to(device), y.to(device)
            loss = loss_function(model(x), y)
            # weighted average of the loss adjusted for the batch size
            running_loss += loss.item() * x.shape[0]
            total_samples += x.shape[0]
    return running_loss / total_samples


def train(
        model: nn.Module,
        train_loader: DataLoader,
        validation_loader: DataLoader,
        criterion: callable,
        optimizer: torch.optim.Optimizer,
        config: dict) -> None:
    """Trains the model for the number of epochs specified in the config."""
    model.train()
    # Tell wandb to watch what the model gets up to: gradients, weights, and more!
    wandb.watch(model, criterion, log="all", log_freq=10)

    # Run training and track with wandb
    example_ct = 0  # number of examples seen
    batch_ct = 0

    for epoch in tqdm(range(config.epochs)):
        running_training_loss = 0
        total_train_samples = 0
        for _, (x_batch, y_batch) in enumerate(train_loader):
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)  # noqa: PLW2901
            # ➡ Forward pass
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            # ⬅ Backward pass & optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            example_ct += len(x_batch)
            batch_ct += 1
            # weighted average of the training loss
            running_training_loss += loss.item() * x_batch.shape[0]
            total_train_samples += x_batch.shape[0]
            # Report metrics every 25th batch
            if ((batch_ct + 1) % 25) == 0:
                avg_training_loss = running_training_loss / total_train_samples
                running_training_loss = 0
                total_train_samples = 0
                model.eval()
                average_validation_loss = calculate_average_loss(
                    validation_loader, model, criterion
                )
                train_log(avg_training_loss, average_validation_loss, example_ct, epoch)
                model.train()


def test(model: nn.Module, test_loader: DataLoader) -> None:
    """Tests the model on the test set. Logs the accuracy to the console and to wandb."""
    model.eval()
    # Run the model on some test examples
    with torch.no_grad():
        correct, total = 0, 0
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)  # noqa: PLW2901
            outputs = model(x)
            _, predicted = torch.max(outputs.data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()

        print(f"Accuracy of the model on the {total} test images: {correct / total:%}")
        wandb.log({'test_accuracy': correct / total})

    # Save the model in the exchangeable ONNX format
    torch.onnx.export(model, x, 'model.onnx')
    wandb.save('model.onnx')


def model_pipeline(config: dict) -> nn.Module:
    """Builds the model and runs it."""
    # tell wandb to get started
    with wandb.init(project="pytorch-demo", config=config):
      config = wandb.config
      # make the model, data, and optimization problem
      model, train_loader, validation_loader, test_loader, criterion, optimizer = make(config)
      print(model)
      # and use them to train the model
      train(model, train_loader, validation_loader, criterion, optimizer, config)
      # and test its final performance
      test(model, test_loader)

    return model

In [6]:
config = {
    'epochs': 5,
    'classes': 10,
    'kernels': [16, 32],
    'batch_size': 128,
    'learning_rate': 0.005,
    'dataset': 'MNIST',
    'architecture': 'CNN',
}
# Build, train and analyze the model with the pipeline
model = model_pipeline(config)

ConvNet(
  (layer1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=1568, out_features=10, bias=True)
)


  0%|          | 0/5 [00:00<?, ?it/s]

Training/Validation Loss after 03072 examples:  7.274 | 1.030
Training/Validation Loss after 06272 examples:  0.679 | 0.499
Training/Validation Loss after 09472 examples:  0.377 | 0.352
Training/Validation Loss after 12672 examples:  0.329 | 0.318
Training/Validation Loss after 15872 examples:  0.305 | 0.256
Training/Validation Loss after 19072 examples:  0.283 | 0.249
Training/Validation Loss after 22272 examples:  0.265 | 0.226
Training/Validation Loss after 25472 examples:  0.219 | 0.229
Training/Validation Loss after 28672 examples:  0.215 | 0.199
Training/Validation Loss after 31872 examples:  0.205 | 0.185
Training/Validation Loss after 35072 examples:  0.200 | 0.188
Training/Validation Loss after 38272 examples:  0.178 | 0.197
Training/Validation Loss after 41472 examples:  0.192 | 0.171
Training/Validation Loss after 44672 examples:  0.215 | 0.191
Training/Validation Loss after 47872 examples:  0.160 | 0.144
Training/Validation Loss after 51072 examples:  0.164 | 0.170


 20%|██        | 1/5 [00:05<00:20,  5.17s/it]

Training/Validation Loss after 54272 examples:  0.185 | 0.155
Training/Validation Loss after 57408 examples:  0.139 | 0.143
Training/Validation Loss after 60608 examples:  0.159 | 0.160
Training/Validation Loss after 63808 examples:  0.160 | 0.166
Training/Validation Loss after 67008 examples:  0.165 | 0.168
Training/Validation Loss after 70208 examples:  0.152 | 0.138
Training/Validation Loss after 73408 examples:  0.136 | 0.153
Training/Validation Loss after 76608 examples:  0.136 | 0.154
Training/Validation Loss after 79808 examples:  0.139 | 0.151
Training/Validation Loss after 83008 examples:  0.138 | 0.139
Training/Validation Loss after 86208 examples:  0.139 | 0.136
Training/Validation Loss after 89408 examples:  0.147 | 0.144
Training/Validation Loss after 92608 examples:  0.148 | 0.138
Training/Validation Loss after 95808 examples:  0.139 | 0.144
Training/Validation Loss after 99008 examples:  0.157 | 0.162
Training/Validation Loss after 102208 examples:  0.141 | 0.146
Trainin

 40%|████      | 2/5 [00:10<00:15,  5.22s/it]

Training/Validation Loss after 111808 examples:  0.195 | 0.138
Training/Validation Loss after 114944 examples:  0.112 | 0.132
Training/Validation Loss after 118144 examples:  0.100 | 0.132
Training/Validation Loss after 121344 examples:  0.128 | 0.141
Training/Validation Loss after 124544 examples:  0.133 | 0.120
Training/Validation Loss after 127744 examples:  0.123 | 0.131
Training/Validation Loss after 130944 examples:  0.152 | 0.157
Training/Validation Loss after 134144 examples:  0.131 | 0.144
Training/Validation Loss after 137344 examples:  0.133 | 0.165
Training/Validation Loss after 140544 examples:  0.130 | 0.114
Training/Validation Loss after 143744 examples:  0.096 | 0.120
Training/Validation Loss after 146944 examples:  0.132 | 0.161
Training/Validation Loss after 150144 examples:  0.119 | 0.149
Training/Validation Loss after 153344 examples:  0.127 | 0.133
Training/Validation Loss after 156544 examples:  0.136 | 0.126
Training/Validation Loss after 159744 examples:  0.121 

 60%|██████    | 3/5 [00:15<00:10,  5.27s/it]

Training/Validation Loss after 166144 examples:  0.121 | 0.118
Training/Validation Loss after 169280 examples:  0.083 | 0.134
Training/Validation Loss after 172480 examples:  0.118 | 0.109
Training/Validation Loss after 175680 examples:  0.103 | 0.111
Training/Validation Loss after 178880 examples:  0.134 | 0.126
Training/Validation Loss after 182080 examples:  0.120 | 0.118
Training/Validation Loss after 185280 examples:  0.122 | 0.155
Training/Validation Loss after 188480 examples:  0.111 | 0.125
Training/Validation Loss after 191680 examples:  0.133 | 0.126
Training/Validation Loss after 194880 examples:  0.110 | 0.114
Training/Validation Loss after 198080 examples:  0.113 | 0.143
Training/Validation Loss after 201280 examples:  0.140 | 0.116
Training/Validation Loss after 204480 examples:  0.130 | 0.130
Training/Validation Loss after 207680 examples:  0.116 | 0.148
Training/Validation Loss after 210880 examples:  0.104 | 0.129
Training/Validation Loss after 214080 examples:  0.113 

 80%|████████  | 4/5 [00:20<00:05,  5.23s/it]

Training/Validation Loss after 223680 examples:  0.117 | 0.117
Training/Validation Loss after 226816 examples:  0.117 | 0.118
Training/Validation Loss after 230016 examples:  0.116 | 0.119
Training/Validation Loss after 233216 examples:  0.087 | 0.109
Training/Validation Loss after 236416 examples:  0.121 | 0.151
Training/Validation Loss after 239616 examples:  0.111 | 0.136
Training/Validation Loss after 242816 examples:  0.100 | 0.131
Training/Validation Loss after 246016 examples:  0.104 | 0.125
Training/Validation Loss after 249216 examples:  0.122 | 0.134
Training/Validation Loss after 252416 examples:  0.115 | 0.129
Training/Validation Loss after 255616 examples:  0.117 | 0.117
Training/Validation Loss after 258816 examples:  0.105 | 0.122
Training/Validation Loss after 262016 examples:  0.117 | 0.115
Training/Validation Loss after 265216 examples:  0.117 | 0.118
Training/Validation Loss after 268416 examples:  0.111 | 0.117
Training/Validation Loss after 271616 examples:  0.096 

100%|██████████| 5/5 [00:26<00:00,  5.28s/it]

Training/Validation Loss after 278016 examples:  0.130 | 0.123





Accuracy of the model on the 7000 test images: 96.714286%




0,1
epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆████████
test_accuracy,▁
training_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_loss,█▃▂▂▂▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,4.0
test_accuracy,0.96714
training_loss,0.13011
validation_loss,0.12278
