# RNN Sum 100 - Keras

## Imports

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# seed for reproducable solutions
np.random.seed(42)

In [4]:
# 10000 samples (8000 train, 2000 test), numbers from 0 to 9, sequences of length 30
samples = 10000
seq_length = 30
epochs = 60
batch_size = 50

## Data creation

In [6]:
def create_data(samples, seq_length):
    x_ = np.random.random_integers(low=0, high=9, size=(samples, seq_length))
    # needs one more dim for keras
    x = x_[:, :, np.newaxis]
    y = np.sum(x_, axis=1) 
    y = y[:,np.newaxis]
    y = y >= 100

    return x, y

# get the numbers from boolean labels
def y_to_nb(y):
    return y.astype(int)

In [8]:
x, y = create_data(samples, seq_length)
x_tr, x_test, y_tr, y_test = train_test_split(x, y, test_size=0.2)

  x_ = np.random.random_integers(low=0, high=9, size=(samples, seq_length))


In [13]:
x_tr.shape

(8000, 30, 1)

## Defining the model

In [17]:
print('Build model...')
model = Sequential()
# 200 lstm cells
model.add(LSTM(200, input_shape=(seq_length,1)))
# sigmoid is important to use it for binary_crossentropy 

# TimeDistributed layer would create output at each step
# but Recurrent layer would have to return_sequence
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Build model...
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 200)               161600    
                                                                 
 dense_2 (Dense)             (None, 1)                 201       
                                                                 
Total params: 161,801
Trainable params: 161,801
Non-trainable params: 0
_________________________________________________________________
None


## Training the model

In [15]:
hist = model.fit(x_tr, y_tr, validation_data=(x_test, y_test), epochs=epochs, batch_size=batch_size)

Epoch 1/60
Epoch 2/60
Epoch 3/60

KeyboardInterrupt: 

## Evaluating the model

In [8]:
scores = model.evaluate(x_test, y_test, verbose=0)
print("Loss: %0.4f, Accuracy: %.2f%%" % (scores[0], scores[1]*100))

Loss: 0.0162, Accuracy: 99.45%


# PyTorch Version

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import torch.nn.init as init

## Numpy Data to Torch DataLoader

In [11]:
x_tr = x_tr.astype(np.float32, copy=False)
tensor_x_tr = torch.Tensor(x_tr)

y_tr = y_tr.astype(np.float32, copy=False)
tensor_y_tr = torch.Tensor(y_tr)

x_test = x_test.astype(np.float32, copy=False)
tensor_x_test = torch.Tensor(x_test)

y_test = y_test.astype(np.float32, copy=False)
tensor_y_test = torch.Tensor(y_test)

# create datasets
train_data = torch.utils.data.TensorDataset(tensor_x_tr, tensor_y_tr) 
test_data = torch.utils.data.TensorDataset(tensor_x_test, tensor_y_test)

train_loader = torch.utils.data.DataLoader(
                dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
                dataset=test_data, batch_size=x_test.__len__(), shuffle=False)

## Model

In [12]:
class LSTMNN(nn.Module):
    # just the layers and other variables
    # computation is defined in forward method
    def __init__(self):
        super().__init__()
            
        # 200 lstm cells
        self.num_units = 200
        
        # one layer, with 200 cells 
        # data is passed with batch_size as first dim
        # easier to handle in forward pass
        self.lstm = nn.LSTM(1, self.num_units, batch_first = True)
        
        # 200 cells map to one output
        self.output = nn.Linear(self.num_units, 1)
        
    # helper method
    def init_hidden(self, batch_size):
        # initial hidden state is empty -> zeros
        return (Variable(torch.zeros(1, batch_size, self.num_units)),
                Variable(torch.zeros(1, batch_size, self.num_units)))

    # computation is done here
    # backward pass (backprop) is calculated automatically
    def forward(self, x):
        # get batch_size from data
        # evaluation of the model done on complete test batch
        batch_size = x.size()[0]

        # initial hidden state (step = 0)
        hidden_0 = self.init_hidden(batch_size)
        
        # pass input and first hidden state, to lstm
        # defined in __init__
        lstm_out, last = self.lstm(x, hidden_0)
        # lstm_out contains all hidden states (at each time step)
        # should be possible to use a dense layer on all steps
        # last (tuple) contains hidden and cell state of last step
        
        # take hidden state at last timestep 
        # dim order is changed due to batch_first = True
        out = self.output(lstm_out[:,-1,:])
        # or
        #out = self.output(last[0][0,:,:]) 
        # both are the same
        
        out = F.sigmoid(out)
        return out



In [13]:
model = LSTMNN()
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-8)

In [14]:
def train(epoch):
    # indicates training
    # useful for e.g. dropout
    # here not necessary but good practice
    model.train()
    for batch_idx, (data, label) in enumerate(train_loader):
        # input has to be a autograd.Variable
        # allows automatic differentiation
        data, label = Variable(data), Variable(label)
        # has to done before each optimizer step 
        # (each backprop)
        # optimizer accumulates the changes
        optimizer.zero_grad()

        output = model(data)
        loss = F.binary_cross_entropy(output, label)
        # calculates the backpropagation started at the loss
        loss.backward()
        
        # updates the parameters of the network
        # by specified optimizer
        optimizer.step()
        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data[0]))
            
def test():
    # indicates testing
    # useful for e.g. dropout
    # here not necessary but good practice
    model.eval()
    test_loss = 0
    correct = 0
    for data, label in test_loader:
        # volatile true deactivates
        # the calculation of gradients in backward pass
        # also deactivates all nodes that depend on data 
        # means whole network doesn't calculate gradients
        data, label = Variable(data, volatile=True), Variable(label)
        
        output = model(data)
        test_loss += F.binary_cross_entropy(output, label, 
                        size_average=False).data[0] # sum up batch loss
        pred = output.data.ge(0.5).float() # to 1.0 if >= 0.5 else 0.0
        correct += pred.eq(label.data.view_as(pred)).sum()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [15]:
print("Number of Epochs:", epochs)
for epoch in range(1, epochs+1):
    train(epoch)
    test()

Number of Epochs: 60

Test set: Average loss: 0.0581, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0579, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0584, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0612, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0572, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0425, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0571, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0490, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0568, Accuracy: 1979/2000 (99%)




Test set: Average loss: 0.0527, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0380, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0358, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0148, Accuracy: 1988/2000 (99%)


Test set: Average loss: 0.0263, Accuracy: 1979/2000 (99%)


Test set: Average loss: 0.0207, Accuracy: 1981/2000 (99%)


Test set: Average loss: 0.0220, Accuracy: 1981/2000 (99%)


Test set: Average loss: 0.0103, Accuracy: 1991/2000 (100%)


Test set: Average loss: 0.0078, Accuracy: 1994/2000 (100%)


Test set: Average loss: 0.0114, Accuracy: 1990/2000 (100%)




Test set: Average loss: 0.0187, Accuracy: 1982/2000 (99%)


Test set: Average loss: 0.0086, Accuracy: 1992/2000 (100%)


Test set: Average loss: 0.0171, Accuracy: 1984/2000 (99%)


Test set: Average loss: 0.0135, Accuracy: 1989/2000 (99%)


Test set: Average loss: 0.0091, Accuracy: 1993/2000 (100%)


Test set: Average loss: 0.0107, Accuracy: 1988/2000 (99%)


Test set: Average loss: 0.0095, Accuracy: 1993/2000 (100%)


Test set: Average loss: 0.0069, Accuracy: 1996/2000 (100%)


Test set: Average loss: 0.0107, Accuracy: 1988/2000 (99%)


Test set: Average loss: 0.0142, Accuracy: 1988/2000 (99%)




Test set: Average loss: 0.0095, Accuracy: 1993/2000 (100%)


Test set: Average loss: 0.0098, Accuracy: 1992/2000 (100%)


Test set: Average loss: 0.0073, Accuracy: 1995/2000 (100%)


Test set: Average loss: 0.0113, Accuracy: 1991/2000 (100%)


Test set: Average loss: 0.0181, Accuracy: 1986/2000 (99%)


Test set: Average loss: 0.0097, Accuracy: 1989/2000 (99%)


Test set: Average loss: 0.0152, Accuracy: 1982/2000 (99%)


Test set: Average loss: 0.0133, Accuracy: 1991/2000 (100%)


Test set: Average loss: 0.0066, Accuracy: 1996/2000 (100%)


Test set: Average loss: 0.0063, Accuracy: 1996/2000 (100%)




Test set: Average loss: 0.0095, Accuracy: 1990/2000 (100%)


Test set: Average loss: 0.0081, Accuracy: 1995/2000 (100%)


Test set: Average loss: 0.0231, Accuracy: 1984/2000 (99%)


Test set: Average loss: 0.0105, Accuracy: 1991/2000 (100%)


Test set: Average loss: 0.0161, Accuracy: 1989/2000 (99%)


Test set: Average loss: 0.0104, Accuracy: 1991/2000 (100%)


Test set: Average loss: 0.0092, Accuracy: 1994/2000 (100%)


Test set: Average loss: 0.0139, Accuracy: 1988/2000 (99%)


Test set: Average loss: 0.0141, Accuracy: 1991/2000 (100%)


Test set: Average loss: 0.0124, Accuracy: 1990/2000 (100%)




Test set: Average loss: 0.0093, Accuracy: 1989/2000 (99%)


Test set: Average loss: 0.0141, Accuracy: 1989/2000 (99%)


Test set: Average loss: 0.0084, Accuracy: 1994/2000 (100%)


Test set: Average loss: 0.0115, Accuracy: 1990/2000 (100%)


Test set: Average loss: 0.0181, Accuracy: 1986/2000 (99%)


Test set: Average loss: 0.0269, Accuracy: 1984/2000 (99%)


Test set: Average loss: 0.0116, Accuracy: 1991/2000 (100%)


Test set: Average loss: 0.0191, Accuracy: 1987/2000 (99%)


Test set: Average loss: 0.0147, Accuracy: 1988/2000 (99%)


Test set: Average loss: 0.0120, Accuracy: 1988/2000 (99%)




Test set: Average loss: 0.0173, Accuracy: 1986/2000 (99%)

