In [None]:
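# Optional one-time environment setup; uncomment the lines below to run them in a fresh environment.
# %matplotlib inline
# !pip install matplotlib
# !pip install smdebug -q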

In [1]:
import torch
from torch import optim
import torchvision
import torchvision.transforms as transforms
import time

In [2]:
# Preprocessing applied to every image: convert it to a tensor, then normalize
# each of the three channels with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Arguments shared by the training and test splits of CIFAR-10.
cifar_kwargs = dict(root='./data', download=True, transform=transform)

# Training split: shuffled mini-batches of 4 images, loaded by 2 worker processes.
trainset = torchvision.datasets.CIFAR10(train=True, **cifar_kwargs)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=4, shuffle=True, num_workers=2)

# Test split: same batch size and worker count, but iterated in a fixed order.
testset = torchvision.datasets.CIFAR10(train=False, **cifar_kwargs)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=4, shuffle=False, num_workers=2)

# Human-readable class names (ten entries).
# NOTE(review): presumably in CIFAR-10 label-index order -- confirm before
# using this tuple to decode predictions.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [3]:
import torch.nn as nn
import torch.nn.functional as F

# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

class Net(nn.Module):
    """Small convolutional classifier: two conv + max-pool stages, then three linear layers.

    The first linear layer takes 16 * 5 * 5 features per sample, which is what
    the convolutional stack produces for 3-channel 32x32 images
    (32 -> 28 -> 14 -> 10 -> 5 along each spatial axis). The last layer emits
    10 raw (un-normalized) scores per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)    # 3 input channels -> 6 maps, 5x5 kernel
        self.pool = nn.MaxPool2d(2, 2)     # 2x2 window, stride 2; reused after both convs
        self.conv2 = nn.Conv2d(6, 16, 5)   # 6 maps -> 16 maps, 5x5 kernel
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Compute the 10 output scores for each image in ``x``.

        Args:
            x: Image tensor whose trailing dimensions are (3, 32, 32).

        Returns:
            Tensor with one row of 10 scores per input image.

        Raises:
            RuntimeError: If the convolutional stack does not produce
                (16, 5, 5) features per sample, i.e. the input images are
                not 32x32.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))

        # Guard before flattening: a bare view(-1, 400) silently folds surplus
        # spatial features into extra batch rows whenever their count happens
        # to be a multiple of 400 (e.g. 52x52 inputs), returning more rows than
        # there were images instead of failing.
        feature_shape = tuple(x.shape[-3:])
        if feature_shape != (16, 5, 5):
            raise RuntimeError(
                "Net expects 3x32x32 inputs producing (16, 5, 5) conv features "
                f"per sample, but got features of shape {feature_shape}"
            )

        # reshape instead of view: same result for contiguous activations, but
        # it copies rather than raising when the memory layout is not viewable.
        x = x.reshape(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Build the network and move its parameters to the selected device before the
# optimizer is created from them.
net = Net()
net.to(device)  # nn.Module.to() updates the module in place and returns it

# Loss module applied to the network outputs and the integer class labels.
criterion = nn.CrossEntropyLoss()

# SGD with momentum over all of the network's parameters.
optimizer = optim.SGD(params=net.parameters(), lr=0.001, momentum=0.9)

In [5]:
import smdebug.pytorch as smd

# Timestamped job name: fixed prefix followed by the current UTC time
# formatted as month-day-hour-minute-second.
run_stamp = time.strftime("%m-%d-%H-%M-%S", time.gmtime())
job_name = 'pytorch-exp' + run_stamp

# smdebug hook that writes into a per-job directory and is limited to the
# 'gradients' and 'biases' collections.
# NOTE(review): save_interval=10 presumably means "save every 10th step" --
# confirm against the smdebug SaveConfig documentation.
hook = smd.Hook(
    out_dir=f'./smd_outputs/{job_name}',
    save_config=smd.SaveConfig(save_interval=10),
    include_collections=['gradients', 'biases'],
)

# Attach the hook to the model and to the loss module.
hook.register_module(net)
hook.register_loss(criterion)

[2020-03-30 05:53:52.762 ip-172-31-28-67:31414 INFO hook.py:215] Saving to ./smd_outputs/pytorch-exp03-30-05-53-52


In [7]:
# Train for one pass over the training set, printing the mean loss of every
# window of `report_every` mini-batches.
report_every = 2000

for epoch in range(1):
    running_loss = 0.0

    # Count mini-batches from 1 so the counter doubles as the printed step number.
    for step, batch in enumerate(trainloader, start=1):
        # Each batch is [inputs, labels]; move both to the training device.
        inputs = batch[0].to(device)
        labels = batch[1].to(device)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss.
        outputs = net(inputs)
        loss = criterion(outputs, labels)

        # Hand the loss tensor to the smdebug hook under the name "loss".
        hook.record_tensor_value(tensor_name="loss", tensor_value=loss)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate the loss and report the window average at each interval.
        running_loss += loss.item()
        if step % report_every == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, step, running_loss / report_every))
            running_loss = 0.0

print('Finished Training')

[2020-03-30 05:54:16.361 ip-172-31-28-67:31414 INFO hook.py:351] Monitoring the collections: biases, losses, gradients
[1,  2000] loss: 2.169
[1,  4000] loss: 1.872
[1,  6000] loss: 1.675
[1,  8000] loss: 1.587
[1, 10000] loss: 1.500
[1, 12000] loss: 1.472
Finished Training


In [8]:
# Open the output directory this run's hook was configured to write to, then
# display the names of the tensors recorded there.
trial_dir = f'./smd_outputs/{job_name}'
trial = smd.create_trial(path=trial_dir)
trial.tensor_names()

[2020-03-30 06:02:17.108 ip-172-31-28-67:31414 INFO local_trial.py:35] Loading trial pytorch-exp03-30-05-53-52 at path ./smd_outputs/pytorch-exp03-30-05-53-52


['CrossEntropyLoss_output_0',
 'Net_conv1.bias',
 'Net_conv2.bias',
 'Net_fc1.bias',
 'Net_fc2.bias',
 'Net_fc3.bias',
 'gradient/Net_conv1.bias',
 'gradient/Net_conv1.weight',
 'gradient/Net_conv2.bias',
 'gradient/Net_conv2.weight',
 'gradient/Net_fc1.bias',
 'gradient/Net_fc1.weight',
 'gradient/Net_fc2.bias',
 'gradient/Net_fc2.weight',
 'gradient/Net_fc3.bias',
 'gradient/Net_fc3.weight',
 'loss_output_0']

In [None]:
# Look up the hook's 'gradients' collection; as the cell's last expression,
# the returned object is displayed as the cell output.
hook.get_collection('gradients')

In [None]:
# Fetch the tensor saved as 'loss_output_0' (one of the names listed by
# trial.tensor_names()) and display its recorded values.
loss_tensor = trial.tensor('loss_output_0')
loss_tensor.values()