In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.autograd import Variable
import time

In [2]:
# Build the MNIST training and test splits with identical settings; only the
# `train` flag differs. Each split gets its own ToTensor transform and
# download=True is passed so torchvision can fetch the files into ./data.
train_dataset, test_dataset = (
    datasets.MNIST(
        root='./data',
        train=is_train,
        transform=transforms.ToTensor(),
        download=True,
    )
    for is_train in (True, False)
)

In [3]:
# Mini-batch size shared by the training and test loaders.
batch_size = 100
# Number of full passes over the training set.
epochs = 5

# Training batches are reshuffled every epoch; test batches keep a fixed order.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size,
                                          shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size,
                                          shuffle=False)

# Total number of optimizer steps: one per training batch per epoch.
# Derived from the loader so it is an integer and also counts a trailing
# partial batch (the previous `/` expression produced a float).
iterations = epochs * len(train_loader)

In [8]:
class LSTMModel(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Number of outputs produced by the final linear layer.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        # batch_first=True: inputs/outputs are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the LSTM over ``x`` and project the final time step.

        Args:
            x: Tensor of shape (batch, seq_len, input_dim).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        # Zero initial hidden/cell states, created on the same device and with
        # the same dtype as the input so the model works on CPU or any GPU
        # (previously these were unconditionally moved to the default CUDA device).
        state_shape = (self.layer_dim, x.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        out, (hn, cn) = self.lstm(x, (h0, c0))
        # Only the output at the last time step feeds the linear layer.
        out = self.fc(out[:, -1, :])
        return out

In [9]:
# Each image is reshaped later to 28 time steps with 28 features per step.
input_dim = 28
hidden_dim = 100
# Single-layer LSTM for this run.
layer_dim = 1
# Size of the final linear layer's output.
output_dim = 10

# Prefer the GPU when one is available, otherwise stay on the CPU instead of
# raising from an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
# .to() returns the module, so the cell still displays the model summary.
model.to(device)

LSTMModel(
  (lstm): LSTM(28, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=10, bias=True)
)

In [13]:
# Plain SGD over all model parameters, scored with cross-entropy loss.
learning_rate = 0.1
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [14]:
# Train the current `model` for `epochs` passes over `train_loader`, and every
# 500 optimizer steps report the latest training loss and the accuracy on
# `test_loader`.
seq_dim = 28
itern = 0
# Follow whichever device the model lives on instead of hard-coding CUDA.
device = next(model.parameters()).device
start_time = time.time()
for epoch in range(epochs):
    for i, (images, labels) in enumerate(train_loader):

        # Reshape the batch to (N, seq_dim, input_dim) for the batch-first LSTM.
        images = images.view(-1, seq_dim, input_dim).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)

        loss = criterion(outputs, labels)
        # .item() extracts the scalar loss as a Python float.
        loss_val = loss.item()
        loss.backward()

        optimizer.step()

        itern += 1
        if itern % 500 == 0:
            correct = 0
            total = 0
            # Evaluate in eval mode without building an autograd graph.
            model.eval()
            with torch.no_grad():
                for test_images, test_labels in test_loader:
                    test_images = test_images.view(-1, seq_dim, input_dim).to(device)
                    test_preds = model(test_images)
                    # Index of the largest score is the predicted class.
                    _, predicted = torch.max(test_preds, 1)
                    predicted = predicted.cpu()
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            # Restore training mode before the next backward pass.
            model.train()
            accuracy = 100 * correct / total
            print('Epoch {}, Iter {}, Loss {}, Accuracy {}'.format(epoch, itern,loss_val, accuracy))
print('Training time {}'.format(time.time() - start_time))

Epoch 0, Iter 500, Loss 1.0740976333618164, Accuracy 70.08
Epoch 1, Iter 1000, Loss 0.5941503643989563, Accuracy 85.28
Epoch 2, Iter 1500, Loss 0.2734034061431885, Accuracy 93.23
Epoch 3, Iter 2000, Loss 0.20964401960372925, Accuracy 94.47
Epoch 4, Iter 2500, Loss 0.32857441902160645, Accuracy 94.2
Epoch 4, Iter 3000, Loss 0.16948994994163513, Accuracy 95.86
Training time 55.67040300369263


In [15]:
# Same experiment as before, but with a 2-layer LSTM: rebuild the model, loss
# and optimizer, then train and periodically evaluate on the test set.
input_dim = 28
hidden_dim = 100
layer_dim = 2
output_dim = 10

# Prefer the GPU when one is available, otherwise stay on the CPU instead of
# raising from an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
model.to(device)

criterion = nn.CrossEntropyLoss()
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

seq_dim = 28
itern = 0
start_time = time.time()
for epoch in range(epochs):
    for i, (images, labels) in enumerate(train_loader):

        # Reshape the batch to (N, seq_dim, input_dim) for the batch-first LSTM.
        images = images.view(-1, seq_dim, input_dim).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)

        loss = criterion(outputs, labels)
        # .item() extracts the scalar loss as a Python float.
        loss_val = loss.item()
        loss.backward()

        optimizer.step()

        itern += 1
        if itern % 500 == 0:
            correct = 0
            total = 0
            # Evaluate in eval mode without building an autograd graph.
            model.eval()
            with torch.no_grad():
                for test_images, test_labels in test_loader:
                    test_images = test_images.view(-1, seq_dim, input_dim).to(device)
                    test_preds = model(test_images)
                    # Index of the largest score is the predicted class.
                    _, predicted = torch.max(test_preds, 1)
                    predicted = predicted.cpu()
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            # Restore training mode before the next backward pass.
            model.train()
            accuracy = 100 * correct / total
            print('Epoch {}, Iter {}, Loss {}, Accuracy {}'.format(epoch, itern,loss_val, accuracy))
print('Training time {}'.format(time.time() - start_time))

Epoch 0, Iter 500, Loss 2.2955307960510254, Accuracy 11.37
Epoch 1, Iter 1000, Loss 1.7881383895874023, Accuracy 35.77
Epoch 2, Iter 1500, Loss 0.8789016604423523, Accuracy 70.85
Epoch 3, Iter 2000, Loss 0.2187109738588333, Accuracy 92.69
Epoch 4, Iter 2500, Loss 0.13512539863586426, Accuracy 95.09
Epoch 4, Iter 3000, Loss 0.22372448444366455, Accuracy 96.19
Training time 68.01184701919556


In [16]:
# Same experiment as before, but with a 3-layer LSTM: rebuild the model, loss
# and optimizer, then train and periodically evaluate on the test set.
input_dim = 28
hidden_dim = 100
layer_dim = 3
output_dim = 10

# Prefer the GPU when one is available, otherwise stay on the CPU instead of
# raising from an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
model.to(device)

criterion = nn.CrossEntropyLoss()
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

seq_dim = 28
itern = 0
start_time = time.time()
for epoch in range(epochs):
    for i, (images, labels) in enumerate(train_loader):

        # Reshape the batch to (N, seq_dim, input_dim) for the batch-first LSTM.
        images = images.view(-1, seq_dim, input_dim).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)

        loss = criterion(outputs, labels)
        # .item() extracts the scalar loss as a Python float.
        loss_val = loss.item()
        loss.backward()

        optimizer.step()

        itern += 1
        if itern % 500 == 0:
            correct = 0
            total = 0
            # Evaluate in eval mode without building an autograd graph.
            model.eval()
            with torch.no_grad():
                for test_images, test_labels in test_loader:
                    test_images = test_images.view(-1, seq_dim, input_dim).to(device)
                    test_preds = model(test_images)
                    # Index of the largest score is the predicted class.
                    _, predicted = torch.max(test_preds, 1)
                    predicted = predicted.cpu()
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            # Restore training mode before the next backward pass.
            model.train()
            accuracy = 100 * correct / total
            print('Epoch {}, Iter {}, Loss {}, Accuracy {}'.format(epoch, itern,loss_val, accuracy))
print('Training time {}'.format(time.time() - start_time))

Epoch 0, Iter 500, Loss 2.2935009002685547, Accuracy 11.35
Epoch 1, Iter 1000, Loss 2.302536725997925, Accuracy 11.81
Epoch 2, Iter 1500, Loss 1.9421212673187256, Accuracy 33.4
Epoch 3, Iter 2000, Loss 0.6727659106254578, Accuracy 71.84
Epoch 4, Iter 2500, Loss 0.4518503248691559, Accuracy 89.01
Epoch 4, Iter 3000, Loss 0.21720018982887268, Accuracy 92.98
Training time 81.64413857460022
