In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# PyTorch on Vertex Experiments: Simple CIFAR10 Example

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/rastringer/vertex-ai-examples/blob/main/pytorch_on_vertex/pytorch_experiments_cifar10.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/rastringer/vertex-ai-examples/blob/main/pytorch_on_vertex/pytorch_experiments_cifar10.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/rastringer/vertex-ai-examples/main/pytorch_on_vertex/pytorch_experiments_cifar10.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

#### Overview

This is a simple example showing how to run experiments on Vertex AI in PyTorch. We use some code from the PyTorch CIFAR10 tutorial found [here](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html) (with thanks to Meta). Vertex Experiments allows us to track and analyze different model architectures, hyper-parameters and training environments.

#### Objective

In this tutorial, we will learn how to experiment with hyper-parameters (epochs and learning rate) and compare results.

We will use the following Google Cloud services and resources:

- *Vertex AI Workbench, Training and Model Registry*
- *Google Cloud Storage*
- *TensorBoard*


#### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* TensorBoard
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),
and [Cloud Storage pricing](https://cloud.google.com/storage/pricing),
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Prerequisites

This tutorial requires a GCP project, a Cloud Storage bucket, and the Vertex AI and Cloud Storage APIs to be enabled.

Please follow the steps in the [gcp_setup.ipynb](https://github.com/rastringer/vertex-ai-examples/blob/main/pytorch_on_vertex/gcp_setup.ipynb) first if necessary. 

#### Colab only: Uncomment the following cell to restart the kernel.

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

#### Authentication

You may need to authenticate this environment with your GCP account.

**Vertex AI Workbench**
* You are already authenticated; skip ahead to the project configuration cell (`PROJECT_ID`, `REGION`, `BUCKET_URI`) below.

**Local JupyterLab instance:** uncomment and run:

In [None]:
# ! gcloud auth login

**Colab**, uncomment and run:

In [None]:
# from google.colab import auth
# auth.authenticate_user()

**Service account** or other
* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples.

In [None]:
# Project settings: replace each placeholder before running the rest of the notebook.
# PROJECT_ID and REGION are handed to `vertex_ai.init(project=..., location=...)`.
PROJECT_ID = "<your-project-id>"# @param {type:"string"}
REGION = "<project-region>"
# Starts with "gs://..."
# BUCKET_URI is only referenced by the (commented-out) cleanup cell at the end.
BUCKET_URI = "<bucket-uri-unique>"

Initialize the Vertex AI SDK for Python.

In [None]:
from google.cloud import aiplatform as vertex_ai
# Point the SDK at the project and region configured in PROJECT_ID / REGION.
vertex_ai.init(project=PROJECT_ID, location=REGION)

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms

In [None]:
# Turn each image into a tensor, then normalize all three channels with
# mean 0.5 and standard deviation 0.5.
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

batch_size = 4


def _cifar10_split(train, shuffle):
    """Download one CIFAR10 split into ./data and wrap it in a DataLoader."""
    dataset = torchvision.datasets.CIFAR10(
        root='./data', train=train, download=True, transform=transform
    )
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, num_workers=2
    )
    return dataset, loader


# Only the training split is shuffled.
trainset, trainloader = _cifar10_split(train=True, shuffle=True)
testset, testloader = _cifar10_split(train=False, shuffle=False)

# Human-readable label names, one per class.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Let us show some of the training images, for fun.



### Model Class 


In [None]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small convolutional classifier.

    Two convolution + ReLU + max-pool stages feed three fully connected
    layers. The first linear layer expects 16 * 5 * 5 flattened features and
    the last one emits 10 scores per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        # A single pooling layer is shared by both convolution stages.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the raw class scores (no softmax applied) for the batch ``x``."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Keep the batch dimension and flatten everything else.
        features = x.flatten(start_dim=1)
        hidden = F.relu(self.fc1(features))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
net = Net().to(device)

### Loss function and optimizer
Classification Cross-Entropy loss and SGD with momentum.



In [None]:
import torch.optim as optim

def trainer(epochs, lr, log_every=2000):
    """Train the module-level ``net`` on ``trainloader`` with SGD (momentum 0.9).

    The function reads ``net``, ``trainloader`` and ``device`` from the
    notebook's global scope and updates ``net`` in place.

    Args:
        epochs: Number of full passes over ``trainloader``.
        lr: Learning rate handed to ``optim.SGD``.
        log_every: Print the mean loss after every ``log_every`` mini-batches.
            Defaults to 2000, the interval that used to be hard-coded.

    Returns:
        None.

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError(f"log_every must be >= 1, got {log_every}")

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9)

    # Make sure the model is in training mode before optimizing it.
    net.train()

    for epoch in range(1, epochs + 1):  # loop over the dataset multiple times

        running_loss = 0.0
        for step, data in enumerate(trainloader, start=1):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if step % log_every == 0:
                # `epoch` is already 1-based, so it is printed as-is
                # (the previous `epoch + 1` labelled the first epoch as 2).
                print(f'[{epoch}, {step:5d}] loss: {running_loss / log_every:.3f}')
                running_loss = 0.0

    print('Finished Training')

In [None]:
def evaluate():
    """Measure the accuracy of the module-level ``net`` on ``testloader``.

    The function reads ``net``, ``testloader`` and ``device`` from the
    notebook's global scope. The model is switched to evaluation mode for the
    duration of the pass and its previous mode is restored afterwards.

    Returns:
        int: Percentage of correctly classified samples, rounded down.

    Raises:
        ValueError: If ``testloader`` yields no samples, since accuracy is
            undefined in that case.
    """
    correct = 0
    total = 0

    was_training = net.training
    net.eval()
    try:
        # since we're not training, we don't need to calculate the gradients for our outputs
        with torch.no_grad():
            for data in testloader:
                images, labels = data[0].to(device), data[1].to(device)
                # calculate outputs by running images through the network
                outputs = net(images)
                # the class with the highest energy is what we choose as prediction
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        net.train(was_training)

    if total == 0:
        raise ValueError("testloader produced no samples; accuracy is undefined")

    accuracy = 100 * correct // total
    print(f'Accuracy of the network on the 10000 test images: {accuracy} %')
    return accuracy

### Train the network

In [None]:
# Experiment name passed to `vertex_ai.init(experiment=...)` for the runs below.
EXPERIMENT_NAME = "cifar10-experiment" 

In [None]:
# Create a Tensorboard resource and keep the handle: it is attached to the
# experiment in the next cell. NOTE: re-running this cell calls `create()` again.
vertex_ai_tb = vertex_ai.Tensorboard.create()

In [None]:
# Configure the SDK with the experiment name and its backing Tensorboard.
vertex_ai.init(experiment=EXPERIMENT_NAME, experiment_tensorboard=vertex_ai_tb)

In [None]:
# Define experiment parameters: one dict per run. The keys match the keyword
# arguments of `trainer(epochs=..., lr=...)`, and each dict is logged as-is
# with `vertex_ai.log_params`.
parameters = [
    {"epochs": 1, "lr": 0.01},
    {"epochs": 1, "lr": 0.05},
    {"epochs": 1, "lr": 0.1}, 
]

### Generate UUID to distinguish between experiment runs

(Run this cell again prior to starting additional runs)

In [None]:
import random
import string

def generate_uuid(length: int = 8) -> str:
    """Return a random identifier of ``length`` lowercase letters and digits.

    Despite the name, the result is not an RFC 4122 UUID; it is a short random
    string (8 characters by default).
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)

# Random suffix that is embedded in every run name of the experiment loop.
UUID = generate_uuid()
print(UUID)

In [None]:
for i, params in enumerate(parameters):
    # Start every run from freshly initialized weights. Without this, each run
    # keeps training the network left over from the previous run, so the
    # logged accuracies cannot be compared across parameter sets.
    # `trainer` and `evaluate` read this module-level `net`.
    net = Net().to(device)

    vertex_ai.start_run(run=f"pytorch-cifar10-exp-{UUID}-{i}")
    try:
        # Log the hyper-parameters first so they are recorded even if training fails.
        vertex_ai.log_params(params)
        trainer(epochs=params["epochs"], lr=params["lr"])
        accuracy = evaluate()
        print(f"Accuracy from evaluate functions is {accuracy}")
        vertex_ai.log_metrics({"accuracy": accuracy})
    finally:
        # Always close the run so a failure does not leave it active.
        vertex_ai.end_run()

In [None]:
# Fetch the experiment's runs as a DataFrame and display it transposed.
# The `param.*` columns of this frame are plotted in the next cell.
experiment_df = vertex_ai.get_experiment_df()
experiment_df.T

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

plt.rcParams["figure.figsize"] = [15, 10]

# Plot the logged accuracy next to the hyper-parameters: without the metric
# column the chart only shows the inputs of each run, not their results.
ax = pd.plotting.parallel_coordinates(
    experiment_df.reset_index(level=0),
    "run_name",
    cols=[
        "param.epochs",
        "param.lr",
        "metric.accuracy",
    ],
    color=["blue", "green", "pink", "red"],
)
# Symmetric-log scale keeps the small learning rates and the much larger
# accuracy percentages readable on one axis.
ax.set_yscale("symlog")
ax.set_title("Hyper-parameters and accuracy per experiment run")
ax.legend(bbox_to_anchor=(1.0, 0.5));

### Cleaning up

To delete the resources used, you can delete the entire Cloud project, or the individual pieces by uncommenting the lines of code below and changing `delete_bucket` to `True`. 

In [None]:
# Delete experiment
# exp = vertex_ai.Experiment(EXPERIMENT_NAME)
# exp.delete(delete_backing_tensorboard_runs=True)

# Delete Tensorboard
# vertex_ai_tb.delete()

# Delete Cloud Storage objects that were created
# (`os` is needed for the `os.getenv` check below; uncomment the import as well.)
# import os
# delete_bucket = False

# if delete_bucket or os.getenv("IS_TESTING"):
#     ! gsutil rm -rf {BUCKET_URI}