In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn 
import torch.nn.functional as F
from torchvision.datasets import MNIST
import torch.optim as optim
import torchvision.transforms as transforms


In [2]:
batch_size = 64
learning_rate = 0.01

In [3]:
class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()

        self.fc1 = nn.Linear(320, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        
    
    
    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2_drop(self.conv2(x))), 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
        
        
        
        
    def num_flat_features(self, x):
        size = x.size()[1:]
        num_features = 1 
        for s in size:
            num_features *= s
        return num_features

In [4]:
net = Net()
print(net)

Net(
  (conv1): Conv2d (1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d (10, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv2_drop): Dropout2d(p=0.5)
  (fc1): Linear(in_features=320, out_features=120)
  (fc2): Linear(in_features=120, out_features=84)
  (fc3): Linear(in_features=84, out_features=10)
)


In [10]:
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])
mnist_train = MNIST(root='./data', download=True, transform=trans, train=True)
mnist_test = MNIST(root='./data', download=True, transform=trans, train=False)

In [11]:
train_loader = torch.utils.data.DataLoader(
                 dataset=mnist_train,
                 batch_size=batch_size,
                 shuffle=True)
test_loader = torch.utils.data.DataLoader(
                dataset=mnist_test,
                batch_size=batch_size,
                shuffle=False)

In [12]:
optimizer = optim.SGD(net.parameters(), lr=learning_rate)

In [13]:
for epoch in range(10):
    
    for batch_idx, (x, target) in enumerate(train_loader):
#         x, target = Variable(x.cuda()), Variable(target.cuda())
        x, target = Variable(x), Variable(target)
        optimizer.zero_grad()
        output = net(x)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        
        if batch_idx % 100 == 0:
            print('--> epoch: {}, batch_index: {}, train_loss: {:.6f}'.format(epoch, batch_idx, loss.data[0]))
            

--> epoch: 0, batch_index: 0, train_loss: nan
--> epoch: 0, batch_index: 100, train_loss: nan
--> epoch: 0, batch_index: 200, train_loss: nan
--> epoch: 0, batch_index: 300, train_loss: nan
--> epoch: 0, batch_index: 400, train_loss: nan
--> epoch: 0, batch_index: 500, train_loss: nan
--> epoch: 0, batch_index: 600, train_loss: nan
--> epoch: 0, batch_index: 700, train_loss: nan
--> epoch: 0, batch_index: 800, train_loss: nan
--> epoch: 0, batch_index: 900, train_loss: nan
--> epoch: 1, batch_index: 0, train_loss: nan
--> epoch: 1, batch_index: 100, train_loss: nan
--> epoch: 1, batch_index: 200, train_loss: nan
--> epoch: 1, batch_index: 300, train_loss: nan
--> epoch: 1, batch_index: 400, train_loss: nan
--> epoch: 1, batch_index: 500, train_loss: nan
--> epoch: 1, batch_index: 600, train_loss: nan
--> epoch: 1, batch_index: 700, train_loss: nan


KeyboardInterrupt: 