In [24]:
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import datasets, transforms

# Define a transform to normalize the data
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,)),
                              ])
# Download and load the training data
trainset = datasets.MNIST('~/.pytorch/MNIST_data/', download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

In [29]:
# Build a feed-forward network
model = nn.Sequential(nn.Linear(784, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64, 10),
                      nn.LogSoftmax(dim = 1))

# Define the loss
criterion = nn.NLLLoss()

# Get our data
dataiter = iter(trainloader)

images, labels = next(dataiter)

# Flatten images
images = images.view(images.shape[0], -1)

# Forward pass, get our logits
logits = model(images)
# Calculate the loss with the logits and the labels
loss = criterion(logits, labels)

print(loss)

tensor(2.3238, grad_fn=<NllLossBackward>)


In [30]:
print('Before backward pass: \n', model[0].weight.grad)

loss.backward()

print('After backward pass: \n', model[0].weight.grad)

Before backward pass: 
 None
After backward pass: 
 tensor([[ 1.0019e-03,  1.0019e-03,  1.0019e-03,  ...,  1.0019e-03,
          1.0019e-03,  1.0019e-03],
        [-2.1058e-03, -2.1058e-03, -2.1058e-03,  ..., -2.1058e-03,
         -2.1058e-03, -2.1058e-03],
        [-3.6505e-04, -3.6505e-04, -3.6505e-04,  ..., -3.6505e-04,
         -3.6505e-04, -3.6505e-04],
        ...,
        [ 4.4715e-03,  4.4715e-03,  4.4715e-03,  ...,  4.4715e-03,
          4.4715e-03,  4.4715e-03],
        [-7.3721e-05, -7.3721e-05, -7.3721e-05,  ..., -7.3721e-05,
         -7.3721e-05, -7.3721e-05],
        [ 3.0158e-03,  3.0158e-03,  3.0158e-03,  ...,  3.0158e-03,
          3.0158e-03,  3.0158e-03]])


In [31]:
from torch import optim

In [32]:
# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [33]:
print('Initial weights - ', model[0].weight)

dataiter = iter(trainloader)
images, labels = next(dataiter)
images.resize_(64, 784)

# Clear the gradients, do this because gradients are accumulated
optimizer.zero_grad()

# Forward pass, then backward pass, then update weights
output = model(images)
loss = criterion(output, labels)
loss.backward()
print('Gradient -', model[0].weight.grad)

Initial weights -  Parameter containing:
tensor([[ 0.0185, -0.0038, -0.0354,  ...,  0.0276, -0.0137, -0.0069],
        [ 0.0116,  0.0357, -0.0239,  ..., -0.0078,  0.0350,  0.0218],
        [ 0.0039, -0.0308, -0.0039,  ...,  0.0104, -0.0150, -0.0342],
        ...,
        [ 0.0053, -0.0053,  0.0283,  ..., -0.0196, -0.0076, -0.0066],
        [-0.0248,  0.0187,  0.0202,  ...,  0.0248,  0.0157,  0.0241],
        [-0.0028,  0.0201, -0.0145,  ..., -0.0177,  0.0308,  0.0242]],
       requires_grad=True)
Gradient - tensor([[-0.0001, -0.0001, -0.0001,  ..., -0.0001, -0.0001, -0.0001],
        [-0.0005, -0.0005, -0.0005,  ..., -0.0005, -0.0005, -0.0005],
        [-0.0008, -0.0008, -0.0008,  ..., -0.0008, -0.0008, -0.0008],
        ...,
        [-0.0014, -0.0014, -0.0014,  ..., -0.0014, -0.0014, -0.0014],
        [-0.0012, -0.0012, -0.0012,  ..., -0.0012, -0.0012, -0.0012],
        [ 0.0023,  0.0023,  0.0023,  ...,  0.0023,  0.0023,  0.0023]])


In [34]:
# Take an update step and view the new weights
optimizer.step()
print('Updated weights - ', model[0].weight)

Updated weights -  Parameter containing:
tensor([[ 0.0185, -0.0038, -0.0354,  ...,  0.0276, -0.0137, -0.0069],
        [ 0.0116,  0.0357, -0.0239,  ..., -0.0078,  0.0350,  0.0218],
        [ 0.0039, -0.0308, -0.0038,  ...,  0.0104, -0.0150, -0.0342],
        ...,
        [ 0.0053, -0.0053,  0.0283,  ..., -0.0196, -0.0076, -0.0065],
        [-0.0247,  0.0187,  0.0202,  ...,  0.0248,  0.0158,  0.0241],
        [-0.0028,  0.0201, -0.0145,  ..., -0.0177,  0.0307,  0.0241]],
       requires_grad=True)


In [35]:
model = nn.Sequential(nn.Linear(784, 128),
                      nn.ReLU(),
                      nn.Linear(128, 64),
                      nn.ReLU(),
                      nn.Linear(64, 10),
                      nn.LogSoftmax(dim=1))

criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.003)

epochs = 5
for e in range(epochs):
    running_loss = 0
    for images, labels in trainloader:
        # Flatten MNIST images into a 784 long vector
        images = images.view(images.shape[0], -1)
    
        # TODO: Training pass
        output = model(images)
        loss = criterion(output, labels)
        loss.backward()
        print('Gradient -', model[0].weight.grad)
        
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss/len(trainloader)}")

Gradient - tensor([[-0.0015, -0.0015, -0.0015,  ..., -0.0015, -0.0015, -0.0015],
        [-0.0036, -0.0036, -0.0036,  ..., -0.0036, -0.0036, -0.0036],
        [ 0.0019,  0.0019,  0.0019,  ...,  0.0019,  0.0019,  0.0019],
        ...,
        [-0.0001, -0.0001, -0.0001,  ..., -0.0001, -0.0001, -0.0001],
        [ 0.0018,  0.0018,  0.0018,  ...,  0.0018,  0.0018,  0.0018],
        [ 0.0010,  0.0010,  0.0010,  ...,  0.0010,  0.0010,  0.0010]])
Gradient - tensor([[-4.0635e-03, -4.0635e-03, -4.0635e-03,  ..., -4.0635e-03,
         -4.0635e-03, -4.0635e-03],
        [-7.0165e-03, -7.0165e-03, -7.0165e-03,  ..., -7.0165e-03,
         -7.0165e-03, -7.0165e-03],
        [ 3.3591e-03,  3.3591e-03,  3.3591e-03,  ...,  3.3591e-03,
          3.3591e-03,  3.3591e-03],
        ...,
        [ 6.0839e-03,  6.0839e-03,  6.0839e-03,  ...,  6.0839e-03,
          6.0839e-03,  6.0839e-03],
        [ 7.2048e-05,  7.2048e-05,  7.2048e-05,  ...,  7.2048e-05,
          7.2048e-05,  7.2048e-05],
        [-1.9775

In [37]:
%matplotlib inline
import helper

dataiter = iter(trainloader)
images, labels = next(dataiter)

img = images[0].view(1, 784)
# Turn off gradients to speed up this part
with torch.no_grad():
    logps = model(img)

# Output of the network are log-probabilities, need to take exponential for probabilities
ps = torch.exp(logps)
helper.view_classify(img.view(1, 28, 28), ps)

AttributeError: module 'helper' has no attribute 'view_classify'