In [4]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from tqdm import tqdm

# Show the installed PyTorch version in the cell output.
print(torch.__version__)

1.10.2


In [23]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


In [6]:
# dataset
# Train and test splits of MNIST, stored under ./mnist/ and downloaded
# when requested via download=True. Every sample goes through ToTensor().

mnist_train = datasets.MNIST('./mnist/', train=True, download=True,
                             transform=transforms.ToTensor())
mnist_test = datasets.MNIST('./mnist/', train=False, download=True,
                            transform=transforms.ToTensor())

# Print both dataset summaries, one after the other.
print(mnist_train, mnist_test, sep='\n')

Dataset MNIST
    Number of datapoints: 60000
    Root location: ./mnist/
    Split: Train
    StandardTransform
Transform: ToTensor()
Dataset MNIST
    Number of datapoints: 10000
    Root location: ./mnist/
    Split: Test
    StandardTransform
Transform: ToTensor()


In [7]:
# dataloader

BATCH_SIZE = 256

# Mini-batch iterators over the two splits; both reshuffle on every pass
# and load data in a single worker process.
# NOTE(review): shuffling the test loader does not change the accuracy
# computed from it; confirm whether it is intentional.
train_iter = torch.utils.data.DataLoader(
    mnist_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=1,
)
test_iter = torch.utils.data.DataLoader(
    mnist_test,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=1,
)


In [9]:
# model

class RNNClass(nn.Module):
    """Stacked LSTM followed by a linear read-out of the last time step.

    Args:
        name: Label stored on the instance as ``self.name``.
        xdim: Number of input features per time step.
        hdim: Size of the LSTM hidden state.
        ydim: Number of outputs of the final linear layer.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self,
                 name='rnn',
                 xdim=28,
                 hdim=256,
                 ydim=10,
                 n_layers=3):
        super().__init__()
        self.name = name
        self.xdim = xdim
        self.hdim = hdim
        self.ydim = ydim
        self.n_layers = n_layers

        # batch_first=True: inputs and outputs are (batch, seq_len, features).
        self.rnn = nn.LSTM(input_size=self.xdim,
                           hidden_size=self.hdim,
                           num_layers=self.n_layers,
                           batch_first=True)
        self.lin = nn.Linear(self.hdim, self.ydim)

    def forward(self, x):
        """Map a batch of sequences to one output vector per sequence.

        Args:
            x: Tensor of shape ``(batch, seq_len, xdim)``.

        Returns:
            Tensor of shape ``(batch, ydim)`` computed from the LSTM output
            at the final time step.
        """
        batch_size = x.size(0)

        # Zero initial hidden/cell states, allocated with the same device and
        # dtype as the input so the module does not depend on any global
        # device variable and keeps working wherever the caller places it.
        h0 = x.new_zeros(self.n_layers, batch_size, self.hdim)
        c0 = x.new_zeros(self.n_layers, batch_size, self.hdim)

        rnn_out, _ = self.rnn(x, (h0, c0))

        # Only the last time step feeds the linear layer: (batch, hdim) -> (batch, ydim).
        return self.lin(rnn_out[:, -1, :])
    
# Two-layer LSTM classifier with 28 input features per step and 10 outputs,
# moved to the selected device.
R = RNNClass(name='rnn', xdim=28, hdim=256, ydim=10, n_layers=2)
R = R.to(device)

# Cross-entropy criterion applied to the model outputs and integer labels.
loss = nn.CrossEntropyLoss()

# Adam over all model parameters with learning rate 1e-3.
optm = optim.Adam(params=R.parameters(), lr=1e-3)

In [10]:
# check parameters
# List every named parameter with its shape, then print the total number
# of scalar entries across all of them.

n_param = 0
for param_name, param in R.named_parameters():
    print(param_name, param.shape)
    n_param += param.numel()

print(n_param)

rnn.weight_ih_l0 torch.Size([1024, 28])
rnn.weight_hh_l0 torch.Size([1024, 256])
rnn.bias_ih_l0 torch.Size([1024])
rnn.bias_hh_l0 torch.Size([1024])
rnn.weight_ih_l1 torch.Size([1024, 256])
rnn.weight_hh_l1 torch.Size([1024, 256])
rnn.bias_ih_l1 torch.Size([1024])
rnn.bias_hh_l1 torch.Size([1024])
lin.weight torch.Size([10, 256])
lin.bias torch.Size([10])
821770


In [11]:
R

RNNClass(
  (rnn): LSTM(28, 256, num_layers=2, batch_first=True)
  (lin): Linear(in_features=256, out_features=10, bias=True)
)

In [14]:
# simple forward pass
# Push one random batch through the model to check the output shape.

np.set_printoptions(precision=3)
torch.set_printoptions(precision=3)

# Random input: 3 sequences, 10 time steps, 28 features per step.
x_numpy = np.random.rand(3, 10, 28)
# NumPy produces float64; convert to float32 and move to the model's device.
x_torch = torch.from_numpy(x_numpy).to(device=device, dtype=torch.float32)
y_torch = R.forward(x_torch)
# Detach from autograd and bring back to host memory before converting.
y_numpy = y_torch.detach().cpu().numpy()

print(f'x_numpy: {x_numpy.shape}')
print(f'x_torch: {x_torch.shape}')
print(f'y_torch: {y_torch.shape}')
print(f'y_numpy: {y_numpy.shape}')

x_numpy: (3, 10, 28)
x_torch: torch.Size([3, 10, 28])
y_torch: torch.Size([3, 10])
y_numpy: (3, 10)


In [19]:
# eval func

def func_eval(model, data_iter, device):
    """Return the classification accuracy of ``model`` over ``data_iter``.

    The model is put in evaluation mode for the duration of the call and its
    previous train/eval mode is restored afterwards, even if an error occurs.

    Args:
        model: Module called on each batch; its output must hold one score
            per class along dimension 1.
        data_iter: Iterable of ``(X, y)`` batches. ``X`` is reshaped to
            ``(-1, 28, 28)`` before being passed to the model.
        device: Device that inputs and labels are moved to.

    Returns:
        Fraction of correctly classified samples as a float, or ``nan`` when
        ``data_iter`` yields no samples.
    """
    was_training = model.training
    # eval() switches layers such as dropout and batch norm to inference
    # behaviour; no_grad() below disables gradient tracking.
    model.eval()

    n_correct, n_total = 0, 0

    try:
        with torch.no_grad():
            for X, y in data_iter:

                y = y.to(device)
                # Call the module itself (not .forward) so registered hooks run.
                output = model(X.view(-1, 28, 28).to(device))
                _, y_pred = torch.max(output, 1)

                n_correct += (y_pred == y).sum().item()
                n_total += X.size(0)
    finally:
        # Put the model back in whichever mode the caller had it in.
        model.train(was_training)

    if n_total == 0:
        # Accuracy is undefined without samples; avoid ZeroDivisionError.
        return float('nan')

    return n_correct / n_total

In [20]:
# Accuracy on both splits, train first and then test.
train_acc = func_eval(R, train_iter, device)
test_acc = func_eval(R, test_iter, device)

print(train_acc, test_acc, sep='\n')

0.09871666666666666
0.098


In [22]:
# train
# Optimise R on train_iter for EPOCHS passes, reporting the mean batch loss
# and the train/test accuracy every `print_every` epochs and after the
# final epoch.

R.train()

EPOCHS = 5
print_every = 1

for epoch in range(EPOCHS):

    # Kept as a plain Python float: adding the loss tensor itself would keep
    # each batch's autograd history referenced for the whole epoch.
    loss_val_sum = 0.0

    for X, y in tqdm(train_iter):

        # Each 28x28 image is passed as a (28, 28) sequence; calling the
        # module (rather than .forward) lets registered hooks run.
        output = R(X.view(-1, 28, 28).to(device))
        loss_out = loss(output, y.to(device))

        optm.zero_grad() # reset grad

        loss_out.backward() # backprop

        optm.step() # optimizer update

        loss_val_sum += loss_out.item()

    # Mean of the per-batch losses over this epoch.
    loss_val_avg = loss_val_sum/len(train_iter)

    # Report on the configured cadence and always after the last epoch.
    if ((epoch%print_every)==0) or (epoch==(EPOCHS-1)):

        train_acc = func_eval(R, train_iter, device)
        test_acc = func_eval(R, test_iter, device)

        print(f'epoch: {epoch}, loss: {loss_val_avg}')
        print(f'train_acc: {train_acc}, test_acc: {test_acc}')

  0%|          | 0/235 [00:00<?, ?it/s]

epoch: 0, loss: 0.14054667949676514
train_acc: 0.9698166666666667, test_acc: 0.9674


  0%|          | 0/235 [00:00<?, ?it/s]

epoch: 1, loss: 0.08790548890829086
train_acc: 0.9807, test_acc: 0.9781


  0%|          | 0/235 [00:00<?, ?it/s]

epoch: 2, loss: 0.06268253177404404
train_acc: 0.9793, test_acc: 0.9778


  0%|          | 0/235 [00:00<?, ?it/s]

epoch: 3, loss: 0.05005732923746109
train_acc: 0.9827166666666667, test_acc: 0.9765


  0%|          | 0/235 [00:00<?, ?it/s]

epoch: 4, loss: 0.04145713150501251
train_acc: 0.9890666666666666, test_acc: 0.9847
