In [1]:
import torch
from torch import nn, optim
from torch.autograd import Variable
from torchvision import datasets, transforms


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
batch_size = 100
epochs = 20


In [7]:
train_dataset = datasets.MNIST(root='./data', 
                            train=True, 
                            transform=transforms.ToTensor(),
                            download=True)

test_dataset = datasets.MNIST(root='./data', 
                           train=False, 
                           transform=transforms.ToTensor())

In [8]:
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=True
)

# valid_loader = torch.utils.data.DataLoader(
#     dataset=valid_dataset, batch_size=batch_size, shuffle=False
# )

test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=batch_size, shuffle=False
)


# coRNN Model

In [None]:
# class coRNNCell(nn.Module):
#     def __init__(self, input_size, hidden_size, dt, gamma, epsilon):
#         super().__init__()
#         self.dt = dt
#         self.gamma = gamma
#         self.epsilon = epsilon
#         self.i2h = nn.Linear(input_size + hidden_size + hidden_size, hidden_size)

#     def forward(self, x, hy, hz):
#         combined_layer = torch.cat((x, hz, hy), 1)
#         hz = hz + self.dt * (
#             torch.tanh(self.i2h(combined_layer)) - self.gamma * hy - self.epsilon * hz
#         )
#         hy = hy + self.dt * hz
# 
#         return hy, hz


# class coRNN(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size, dt, gamma, epsilon):
#         super().__init__()
#         self.hidden_size = hidden_size
#         self.cell = coRNNCell(input_size, hidden_size, dt, gamma, epsilon)
#         self.readout = nn.Linear(hidden_size, output_size)

#     def forward():
#         hy = Variable(torch.zeros(batch_size, hidden_size))
#         hz = Variable


# Vanilla LSTM

In [8]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()

        # ? Hidden layer dimentions
        self.hidden_size = hidden_size

        self.num_layers = num_layers

        # ? shape: (batch, seq, feature)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # ? Readout layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # ? x shape: (batch_size, seq_len, input_size)
        # ? h_0 shape: (num_layers, batch_size, hidden_size)
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device))
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device))

        # ? out shape: (seq_len, batch_size, hidden_size) => (64, 28, 100)
        # ? h_n shape: (num_layers * num_directions, batch_size, hidden_size)
        out, (h_n, c_n) = self.lstm(x, (h_0, c_0))

        out = self.fc(out[:, -1, :])

        return out


In [11]:
input_size = 28
hidden_size = 100
output_size = 10
num_layers = 3

model = LSTMModel(input_size, hidden_size, output_size, num_layers)
model.to(device)

# for param in model.parameters():
#     print(type(param), param.size(), param.data.shape)


LSTMModel(
  (lstm): LSTM(28, 100, num_layers=3, batch_first=True)
  (fc): Linear(in_features=100, out_features=10, bias=True)
)

In [12]:
criterion = nn.CrossEntropyLoss()

In [13]:
learning_rate = 0.001
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)  


In [14]:
seq_size = 28

iter = 0
for epoch in range(epochs):
    for i, (images, labels) in enumerate(train_loader):
        # * (64, 1, 28, 28) -> (64, 28, 28)
        images = Variable(images.view(-1, seq_size, input_size).to(device))
        labels = Variable(labels.to(device))

        # ? Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # ? Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        iter += 1
        if iter % 500 == 0:
            total, correct = 0, 0

            # Iterate through test dataset
            for images, labels in test_loader:
                images = Variable(images.view(-1, seq_size, input_size).to(device))
                labels = Variable(labels.to(device))

                # Forward pass only to get logits/output
                outputs = model(images)
                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                total += labels.size(0)
                # Total correct predictions
                correct += (predicted.cpu() == labels.cpu()).sum()

            accuracy = 100 * correct / total
            print(f"Epoch: {epoch + 1}, Iteration: {iter}, Loss: {loss.item():.4f}, Accuracy: {accuracy}")


Epoch: 1, Iteration: 500, Loss: 2.3027, Accuracy: 10.279999732971191
Epoch: 2, Iteration: 1000, Loss: 2.3088, Accuracy: 10.279999732971191
Epoch: 3, Iteration: 1500, Loss: 2.3018, Accuracy: 10.279999732971191
Epoch: 4, Iteration: 2000, Loss: 2.2986, Accuracy: 10.279999732971191
Epoch: 5, Iteration: 2500, Loss: 2.3104, Accuracy: 10.279999732971191


# New

In [32]:
train_dataset = datasets.MNIST(
    root="./data", train=True, transform=transforms.ToTensor(), download=True
)

test_dataset = datasets.MNIST(root="./data", train=False, transform=transforms.ToTensor())


In [33]:
batch_size = 100
n_iters = 3000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=True
)

test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=batch_size, shuffle=False
)


In [34]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building your LSTM
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        #######################
        #  USE GPU FOR MODEL  #
        #######################

        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(device))
        c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(device))

        # One time step
        out, (hn, cn) = self.lstm(x, (h0, c0))

        # Index hidden state of last time step
        # out.size() --> 100, 28, 100
        # out[:, -1, :] --> 100, 100 --> just want last time step hidden states!
        out = self.fc(out[:, -1, :])
        # out.size() --> 100, 10
        return out


In [35]:
input_dim = 28
hidden_dim = 100
layer_dim = 3  # ONLY CHANGE IS HERE FROM ONE LAYER TO TWO LAYER
output_dim = 10

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
model.to(device)


LSTMModel(
  (lstm): LSTM(28, 100, num_layers=3, batch_first=True)
  (fc): Linear(in_features=100, out_features=10, bias=True)
)

In [36]:
criterion = nn.CrossEntropyLoss()


In [37]:
learning_rate = 0.1

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


In [39]:
# Number of steps to unroll
seq_dim = 28

iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Load images as Variable
        images = Variable(images.view(-1, seq_dim, input_dim)).to(device)
        labels = Variable(labels).to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy
            correct = 0
            total = 0
            # Iterate through test dataset
            for images, labels in test_loader:
                #######################
                #  USE GPU FOR MODEL  #
                #######################
                images = Variable(images.view(-1, seq_dim, input_dim)).to(device)

                # Forward pass only to get logits/output
                outputs = model(images)

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)

                # Total correct predictions
                correct += (predicted.cpu() == labels.cpu()).sum()

            accuracy = 100 * correct / total

            # Print Loss
            print(
                "Iteration: {}. Loss: {}. Accuracy: {}".format(
                    iter, loss.item(), accuracy
                )
            )


NameError: name 'input_size' is not defined