In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Parameter
from torch import Tensor
from torch.utils.data import DataLoader

import torchvision.transforms as transforms
import torchvision.datasets as dataset

import math

## data preprocessing

In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cuda = True if torch.cuda.is_available() else False

Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
torch.manual_seed(29)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(29)

In [8]:
transform = transforms.Compose([
                                transforms.ToTensor(),
                                transforms.Normalize((0.5, ), (1.0, ))
])

In [9]:
data_root = './data'

train_dataset = dataset.MNIST(data_root, transform=transform, train=True, download=True)
val_dataset = dataset.MNIST(data_root, transform=transform, train=False, download=True)
test_dataset = dataset.MNIST(data_root, transform=transform, train=False, download=True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:01<00:00, 8240431.20it/s] 


Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 14420915.93it/s]

Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz





Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 4297813.71it/s]


Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<?, ?it/s]

Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw






In [10]:
batch_size = 64
dataloaders = {}

dataloaders['train'] = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
dataloaders['val'] = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
dataloaders['test'] = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [11]:
batch_size = 100
n_iters = 6000
num_epochs = 10

## modeling


In [69]:
class GRU_cell(nn.Module):
    def __init__(self, input_size, hidden_size, bias=True):
        super(GRU_cell, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.x2h = nn.Linear(input_size, 3 * hidden_size, bias=bias)
        self.h2h = nn.Linear(hidden_size, 3 * hidden_size, bias=bias)
        self.reset_parameters()
    
    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)
    

    def forward(self, x, hidden):
        x = x.view(-1, x.size(1))

        gate_x = self.x2h(x)
        gate_h = self.h2h(hidden)
        
        gate_x = gate_x.squeeze()
        gate_h = gate_h.squeeze()

        i_r, i_i, i_n = gate_x.chunk(3, 1)
        h_r, h_i, h_n = gate_h.chunk(3, 1)

        resetgate = F.sigmoid(i_r + h_r)
        inputgate = F.sigmoid(i_i + h_i)
        newgate = F.tanh(i_n + (resetgate * h_n))

        hy = newgate + inputgate * (hidden - newgate)

        return hy

In [70]:
class GRU(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, bias=True):
        super(GRU, self).__init__()

        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.gru_cell = GRU_cell(input_dim, hidden_dim, layer_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        if torch.cuda.is_available():
            h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).cuda())
        else:
            h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
        
        outs = []
        hn = h0[0, :, :]

        for seq in range(x.size(1)):
            hn = self.gru_cell(x[:, seq, :], hn)
            outs.append(hn)
            
        out = outs[-1].squeeze()
        out = self.fc(out)

        return out

In [71]:
input_dim = 28
hidden_dim = 128
layer_dim = 1
output_dim = 10

model = GRU(input_dim=input_dim, hidden_dim=hidden_dim, layer_dim=layer_dim, output_dim=output_dim)

if torch.cuda.is_available():
    model.cuda()

criterion = nn.CrossEntropyLoss()
lr = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

In [72]:
seq_dim = 28
loss_list = []
iter = 0

for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(dataloaders['train']):
        if torch.cuda.is_available():
            images = Variable(images.view(-1, seq_dim, input_dim).cuda())
            labels = Variable(labels.cuda())
        else:
            images = Variable(images.view(-1, seq_dim, input_dim))
            labels = Variable(labels)
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)

        if torch.cuda.is_available():
            loss.cuda()
        
        loss.backward()
        optimizer.step()

        loss_list.append(loss.item())
        iter += 1

        if iter % 500 == 0:
            correct = 0
            total = 0

            for images, labels in dataloaders['val']:
                if torch.cuda.is_available():
                    images = Variable(images.view(-1, seq_dim, input_dim).cuda())
                else:
                    images = Variable(images.view(-1, seq_dim, input_dim))
                
                outputs = model(images)
                _, preds = torch.max(outputs.data, 1)
                total += labels.size(0)

                if torch.cuda.is_available():
                    correct += (preds.cpu() == labels.cpu()).sum()
                else:
                    correct += (preds == labels).sum()
            
            accuracy = 100 * correct / total

            print(f'Iteration: {iter}, Loss: {loss.item()}, Accuracy: {accuracy}')

Iteration: 500, Loss: 0.14868347346782684, Accuracy: 94.56999969482422
Iteration: 1000, Loss: 0.11673645675182343, Accuracy: 96.80999755859375
Iteration: 1500, Loss: 0.14605703949928284, Accuracy: 97.62999725341797
Iteration: 2000, Loss: 0.011644669808447361, Accuracy: 98.41000366210938
Iteration: 2500, Loss: 0.0691923201084137, Accuracy: 97.91999816894531
Iteration: 3000, Loss: 0.09894999116659164, Accuracy: 97.75
Iteration: 3500, Loss: 0.03623463958501816, Accuracy: 98.66000366210938
Iteration: 4000, Loss: 0.07533424347639084, Accuracy: 98.76000213623047
Iteration: 4500, Loss: 0.21248973906040192, Accuracy: 98.66000366210938
Iteration: 5000, Loss: 0.00031871243845671415, Accuracy: 98.75
Iteration: 5500, Loss: 0.011565646156668663, Accuracy: 98.81999969482422
Iteration: 6000, Loss: 0.008013850077986717, Accuracy: 98.7300033569336
Iteration: 6500, Loss: 0.001384919392876327, Accuracy: 98.61000061035156
Iteration: 7000, Loss: 0.006745252758264542, Accuracy: 98.94999694824219
Iteration: 

## eval & test

In [73]:
def evaluate(model, dataloaders):
    corrects, total, total_loss = 0, 0, 0
    model.eval()

    for images, labels in dataloaders:
        if torch.cuda.is_available():
            images = Variable(images.view(-1, seq_dim, input_dim).cuda())
        else:
            images = Variable(images.view(-1, seq_dim, input_dim)).to(DEVICE)
        
        logit = model(images).to(DEVICE)
        loss = F.cross_entropy(logit, labels, reduction='sum')
        _, preds = torch.max(logit, 1)
        total += labels.size(0)
        total_loss += loss.item()
        corrects += (preds == labels).sum()
    
    avg_loss = total_loss / len(dataloaders.dataset)
    avg_accuracy = corrects / total

    return avg_loss, avg_accuracy

In [74]:
test_loss, test_acc = evaluate(model=model, dataloaders=dataloaders['test'])
print(f'Test Loss : {test_loss} | Test Accuracy : {test_acc}')

Test Loss : 0.04065487438539276 | Test Accuracy : 0.9886999726295471
