In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.datasets as dsets
 

# LSTM implementation

In [7]:
import math

class myLSTM(nn.Module):
    """Single-layer LSTM unrolled step by step over a batch-first input.

    The weights of all four gates are packed side by side into single
    matrices so that each time step needs one input matmul and one hidden
    matmul. With H = hidden_size the packed axis is laid out in the PyTorch
    gate order: [0:H] input, [H:2H] forget, [2H:3H] cell candidate,
    [3H:4H] output.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden and cell states.
        layer_dim: Accepted for signature compatibility only; this module
            always implements exactly one layer and never reads the value.
    """

    def __init__(self, input_size, hidden_size, layer_dim=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Packed gate parameters; values are filled in by init_weights().
        self.weight_ih = nn.Parameter(torch.empty(input_size, hidden_size * 4))
        self.weight_hh = nn.Parameter(torch.empty(hidden_size, hidden_size * 4))
        self.bias = nn.Parameter(torch.empty(hidden_size * 4))

        self.init_weights()

    def init_weights(self):
        """Draw every parameter from U(-1/sqrt(hidden_size), 1/sqrt(hidden_size))."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        with torch.no_grad():
            for weight in self.parameters():
                weight.uniform_(-stdv, stdv)

    def forward(self, x, init_states=None):
        """Run the LSTM over a full sequence.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).
            init_states: Optional ``(h_0, c_0)`` pair, each of shape
                (batch, hidden_size). When None, both start as zeros with
                the dtype and device of ``x``.

        Returns:
            ``(hidden_seq, (h_n, c_n))`` where ``hidden_seq`` has shape
            (batch, seq_len, hidden_size) and ``h_n`` / ``c_n`` are the
            states after the last time step.
        """
        batch_size, seq_size, _ = x.size()
        hs = self.hidden_size

        if init_states is None:
            # The original referenced an undefined name here (`bs`).
            h_t = x.new_zeros(batch_size, hs)
            c_t = x.new_zeros(batch_size, hs)
        else:
            h_t, c_t = init_states

        hidden_seq = []
        for t in range(seq_size):
            x_t = x[:, t, :]

            # One fused projection for all four gates: (batch, 4H).
            gates = x_t @ self.weight_ih + h_t @ self.weight_hh + self.bias

            i_t = torch.sigmoid(gates[:, :hs])            # input gate
            f_t = torch.sigmoid(gates[:, hs:hs * 2])      # forget gate
            g_t = torch.tanh(gates[:, hs * 2:hs * 3])     # cell candidate
            o_t = torch.sigmoid(gates[:, hs * 3:])        # output gate

            c_t = f_t * c_t + i_t * g_t   # update memory content
            h_t = o_t * torch.tanh(c_t)   # update hidden state
            hidden_seq.append(h_t)

        if hidden_seq:
            # Stack along a new time axis -> (batch, seq_len, hidden_size).
            hidden_seq = torch.stack(hidden_seq, dim=1)
        else:
            # Zero-length sequence: return an empty, correctly shaped output.
            hidden_seq = x.new_zeros(batch_size, 0, hs)

        return hidden_seq, (h_t, c_t)


In [8]:
# STEP 1: LOADING DATASET
# MNIST images are converted to tensors; only the training split requests a download.
train_dataset = dsets.MNIST(root='./data', train=True, download=True,
                            transform=transforms.ToTensor())
test_dataset = dsets.MNIST(root='./data', train=False,
                           transform=transforms.ToTensor())

# STEP 2: MAKING DATASET ITERABLE
batch_size = 100
n_iters = 6000
# Epoch count that yields roughly n_iters optimizer steps at this batch size.
num_epochs = int(n_iters / (len(train_dataset) / batch_size))

# Training batches are reshuffled every epoch; the test order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)


In [9]:

'''
STEP 3: CREATE MODEL CLASS
'''
 
class LSTMModel(nn.Module):
    """Sequence classifier: a ``myLSTM`` encoder followed by a linear readout.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Width of the LSTM hidden/cell state.
        output_dim: Number of output logits.
        layer_dim: Forwarded unchanged to ``myLSTM``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, layer_dim):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Recurrent encoder; consumes batch-first input (batch, seq, feature).
        self.lstm = myLSTM(input_dim, hidden_dim, layer_dim)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for x of shape (batch, seq, input_dim)."""
        # Zero initial states allocated with x's dtype and on x's device.
        # Keying this on torch.cuda.is_available() put the states on the GPU
        # even when the model and input were deliberately kept on the CPU.
        h0 = x.new_zeros(x.size(0), self.hidden_dim)
        c0 = x.new_zeros(x.size(0), self.hidden_dim)

        # Note: h0 and c0 could also be learned parameters.
        # out.size() --> (batch, seq, hidden_dim)
        out, _ = self.lstm(x, (h0, c0))

        # Keep only the hidden state of the last time step:
        # out[:, -1, :] --> (batch, hidden_dim)
        out = self.fc(out[:, -1, :])
        # out.size() --> (batch, output_dim)

        return out
 

In [10]:
# STEP 4: INSTANTIATE MODEL CLASS
input_dim, hidden_dim, output_dim = 28, 128, 10
# layer_dim is forwarded to myLSTM, which accepts the argument but never
# uses it, so the network has a single recurrent layer whatever this value is.
layer_dim = 3

model = LSTMModel(input_dim, hidden_dim, output_dim, layer_dim=layer_dim)

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model = model.cuda()

In [11]:
model

LSTMModel(
  (lstm): myLSTM()
  (fc): Linear(in_features=128, out_features=10, bias=True)
)

In [12]:
# STEP 5: INSTANTIATE LOSS CLASS
# Cross-entropy is constructed here without class weights, so it holds no
# tensors and needs no device placement; the unconditional .cuda() call was
# dropped to keep this cell consistent with the guarded device handling
# used everywhere else in the notebook.
criterion = nn.CrossEntropyLoss()

In [13]:
# STEP 6: INSTANTIATE OPTIMIZER CLASS
# Plain SGD over every parameter of the model.
learning_rate = 0.1
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

In [14]:
# STEP 7: TRAIN THE MODEL

# Number of steps to unroll (each image row is one time step)
seq_dim = 28

# Run every batch on whatever device the model's parameters live on, so the
# same code path is used with and without a GPU.
device = next(model.parameters()).device

iter = 0  # global step counter (name kept from the original notebook)
for epoch in range(num_epochs):
    for images, labels in train_loader:
        # (batch, 1, 28, 28) -> (batch, seq_dim, input_dim)
        images = images.view(-1, seq_dim, input_dim).to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get logits: (batch, output_dim)
        outputs = model(images)

        # Calculate loss: softmax --> cross entropy
        loss = criterion(outputs, labels)

        # Backpropagate and update parameters
        loss.backward()
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Evaluate accuracy on the full test set.
            correct = 0
            total = 0
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for test_images, test_labels in test_loader:
                    # Reshape on every device; the original only reshaped
                    # under CUDA, so CPU runs fed a 4-D batch to the model.
                    test_images = test_images.view(-1, seq_dim, input_dim).to(device)

                    test_outputs = model(test_images)

                    # Predicted class = index of the largest logit
                    _, predicted = torch.max(test_outputs, 1)

                    total += test_labels.size(0)
                    # Compare on the CPU and accumulate a plain Python int
                    correct += (predicted.cpu() == test_labels).sum().item()

            # Plain Python float percentage
            accuracy = 100 * correct / total

            # Print loss of the latest training batch and test accuracy
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))


Iteration: 500. Loss: 2.2723288536071777. Accuracy: 18.920000076293945
Iteration: 1000. Loss: 1.1944184303283691. Accuracy: 59.459999084472656
Iteration: 1500. Loss: 0.6594192385673523. Accuracy: 73.30999755859375
Iteration: 2000. Loss: 0.5093726515769958. Accuracy: 88.20999908447266
Iteration: 2500. Loss: 0.216352179646492. Accuracy: 92.79000091552734
Iteration: 3000. Loss: 0.16063569486141205. Accuracy: 93.87999725341797
Iteration: 3500. Loss: 0.15129585564136505. Accuracy: 96.05999755859375
Iteration: 4000. Loss: 0.1264088898897171. Accuracy: 96.6500015258789
Iteration: 4500. Loss: 0.05556895583868027. Accuracy: 96.5199966430664
Iteration: 5000. Loss: 0.2652735412120819. Accuracy: 97.18000030517578
Iteration: 5500. Loss: 0.08962659537792206. Accuracy: 97.2300033569336
Iteration: 6000. Loss: 0.12946972250938416. Accuracy: 97.58999633789062


# GRU implementation

In [61]:
import math

class myGRU(nn.Module):
    """Single-layer GRU unrolled step by step over a batch-first input.

    The weights of the three GRU blocks are packed side by side. With
    H = hidden_size the packed axis is laid out as: [0:H] update gate z,
    [H:2H] reset gate r, [2H:3H] candidate state.

    Per time step:
        z_t = sigmoid(x_t W_iz + h_{t-1} W_hz + b_z)
        r_t = sigmoid(x_t W_ir + h_{t-1} W_hr + b_r)
        n_t = tanh(x_t W_in + (r_t * h_{t-1}) W_hn + b_n)
        h_t = (1 - z_t) * h_{t-1} + z_t * n_t

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden state.
        layer_dim: Accepted for signature compatibility only; never read.
    """

    def __init__(self, input_size, hidden_size, layer_dim=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # A GRU has three packed blocks (z, r, candidate), hence 3 * H columns.
        self.weight_ih = nn.Parameter(torch.empty(input_size, hidden_size * 3))
        self.weight_hh = nn.Parameter(torch.empty(hidden_size, hidden_size * 3))
        self.bias = nn.Parameter(torch.empty(hidden_size * 3))

        self.init_weights()

    def init_weights(self):
        """Draw every parameter from U(-1/sqrt(hidden_size), 1/sqrt(hidden_size))."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        with torch.no_grad():
            for weight in self.parameters():
                weight.uniform_(-stdv, stdv)

    def forward(self, x, init_states=None):
        """Run the GRU over a full sequence.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).
            init_states: Optional ``(h_0, c_0)`` pair. Only ``h_0`` of shape
                (batch, hidden_size) takes part in the recurrence; ``c_0`` is
                carried through untouched so the return value keeps the same
                tuple layout callers already unpack. Zeros when None.

        Returns:
            ``(hidden_seq, (h_n, c_0))`` where ``hidden_seq`` has shape
            (batch, seq_len, hidden_size).
        """
        batch_size, seq_size, _ = x.size()
        hs = self.hidden_size

        if init_states is None:
            # The original referenced an undefined name here (`bs`).
            h_t = x.new_zeros(batch_size, hs)
            c_t = x.new_zeros(batch_size, hs)
        else:
            h_t, c_t = init_states

        # Hidden-to-hidden weights split once: gates (z, r) vs. candidate.
        w_h_gates = self.weight_hh[:, :hs * 2]
        w_h_cand = self.weight_hh[:, hs * 2:]

        hidden_seq = []
        for t in range(seq_size):
            # Input projection (plus bias) for all three blocks: (batch, 3H).
            x_proj = x[:, t, :] @ self.weight_ih + self.bias

            gates = torch.sigmoid(x_proj[:, :hs * 2] + h_t @ w_h_gates)
            z_t = gates[:, :hs]   # update gate
            r_t = gates[:, hs:]   # reset gate

            # Candidate state: the reset gate scales the previous hidden
            # state before it is projected, and only the candidate columns
            # of the packed weights are used.
            h_cap = torch.tanh(x_proj[:, hs * 2:] + (r_t * h_t) @ w_h_cand)

            h_t = (1 - z_t) * h_t + z_t * h_cap
            hidden_seq.append(h_t)

        if hidden_seq:
            # Stack along a new time axis -> (batch, seq_len, hidden_size).
            hidden_seq = torch.stack(hidden_seq, dim=1)
        else:
            # Zero-length sequence: return an empty, correctly shaped output.
            hidden_seq = x.new_zeros(batch_size, 0, hs)

        return hidden_seq, (h_t, c_t)
                   

In [62]:
# STEP 1: LOADING DATASET
# MNIST images are converted to tensors; only the training split requests a download.
train_dataset = dsets.MNIST(root='./data', train=True, download=True,
                            transform=transforms.ToTensor())
test_dataset = dsets.MNIST(root='./data', train=False,
                           transform=transforms.ToTensor())

# STEP 2: MAKING DATASET ITERABLE
batch_size = 100
n_iters = 6000
# Epoch count that yields roughly n_iters optimizer steps at this batch size.
num_epochs = int(n_iters / (len(train_dataset) / batch_size))

# Training batches are reshuffled every epoch; the test order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

'''
STEP 3: CREATE MODEL CLASS
'''
 
class GRUModel(nn.Module):
    """Sequence classifier: a ``myGRU`` encoder followed by a linear readout.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Width of the GRU hidden state.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GRUModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Recurrent encoder; consumes batch-first input (batch, seq, feature).
        self.gru = myGRU(input_dim, hidden_dim)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for x of shape (batch, seq, input_dim)."""
        # Zero initial states allocated with x's dtype and on x's device.
        # Keying this on torch.cuda.is_available() put the states on the GPU
        # even when the model and input were deliberately kept on the CPU.
        h0 = x.new_zeros(x.size(0), self.hidden_dim)
        # myGRU takes an (h0, c0) pair, so a second zero tensor is supplied.
        c0 = x.new_zeros(x.size(0), self.hidden_dim)

        # Note: h0 could also be a learned parameter.
        # out.size() --> (batch, seq, hidden_dim)
        out, _ = self.gru(x, (h0, c0))

        # Keep only the hidden state of the last time step:
        # out[:, -1, :] --> (batch, hidden_dim)
        out = self.fc(out[:, -1, :])
        # out.size() --> (batch, output_dim)

        return out


 
# STEP 4: INSTANTIATE MODEL CLASS
input_dim, hidden_dim, output_dim = 28, 100, 10
layer_dim = 1  # forwarded to the model constructor

# NOTE(review): this cell sits in the GRU section, yet the model built here is
# LSTMModel, so the training that follows exercises the LSTM rather than
# GRUModel. GRUModel takes (input_dim, hidden_dim, output_dim); confirm that
# myGRU.forward runs on a batch before swapping it in.
model = LSTMModel(input_dim, hidden_dim, output_dim, layer_dim=layer_dim)

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model = model.cuda()

# STEP 5: INSTANTIATE LOSS CLASS
criterion = nn.CrossEntropyLoss()

# STEP 6: INSTANTIATE OPTIMIZER CLASS
# Plain SGD over every parameter of the model.
learning_rate = 0.1
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)
 

In [68]:
# STEP 7: TRAIN THE MODEL

# Number of steps to unroll (each image row is one time step)
seq_dim = 28

# Run every batch on whatever device the model's parameters live on, so the
# same code path is used with and without a GPU.
device = next(model.parameters()).device

iter = 0  # global step counter (name kept from the original notebook)
for epoch in range(num_epochs):
    for images, labels in train_loader:
        # (batch, 1, 28, 28) -> (batch, seq_dim, input_dim)
        images = images.view(-1, seq_dim, input_dim).to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get logits: (batch, output_dim)
        outputs = model(images)

        # Calculate loss: softmax --> cross entropy
        loss = criterion(outputs, labels)

        # Backpropagate and update parameters
        loss.backward()
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Evaluate accuracy on the full test set.
            correct = 0
            total = 0
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for test_images, test_labels in test_loader:
                    # Reshape on every device; the original only reshaped
                    # under CUDA, so CPU runs fed a 4-D batch to the model.
                    test_images = test_images.view(-1, seq_dim, input_dim).to(device)

                    test_outputs = model(test_images)

                    # Predicted class = index of the largest logit
                    _, predicted = torch.max(test_outputs, 1)

                    total += test_labels.size(0)
                    # Compare on the CPU and accumulate a plain Python int
                    correct += (predicted.cpu() == test_labels).sum().item()

            # Plain Python float percentage
            accuracy = 100 * correct / total

            # Print loss of the latest training batch and test accuracy
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))

Iteration: 500. Loss: 2.264110803604126. Accuracy: 19.3799991607666
Iteration: 1000. Loss: 1.3947224617004395. Accuracy: 58.91999816894531
Iteration: 1500. Loss: 0.47112739086151123. Accuracy: 84.30000305175781
Iteration: 2000. Loss: 0.2999100387096405. Accuracy: 91.75
Iteration: 2500. Loss: 0.21244826912879944. Accuracy: 93.95999908447266
Iteration: 3000. Loss: 0.18665264546871185. Accuracy: 94.18000030517578
Iteration: 3500. Loss: 0.2370314598083496. Accuracy: 95.54000091552734
Iteration: 4000. Loss: 0.189910426735878. Accuracy: 95.01000213623047
Iteration: 4500. Loss: 0.19920867681503296. Accuracy: 96.43000030517578
Iteration: 5000. Loss: 0.11613094061613083. Accuracy: 96.93000030517578
Iteration: 5500. Loss: 0.0599500946700573. Accuracy: 96.94999694824219
Iteration: 6000. Loss: 0.052838120609521866. Accuracy: 96.5199966430664
