In [1]:
import math
import random

In [2]:
# log loss because the outputs are binary
# sigmoid as the activation function for the hidden and output units

In [3]:
def log_loss(y_hat, y):
    """
    Binary log loss (cross-entropy) for a single prediction.

    y_hat is the prediction, a probability in [0, 1]
    y is the true value, either 0 or 1

    Returns -log(y_hat) when y is 1 and -log(1 - y_hat) when y is 0.
    Raises ValueError when y is neither 0 nor 1.
    """
    if y == 1:
        probability_of_truth = y_hat
    elif y == 0:
        probability_of_truth = 1 - y_hat
    else:
        raise ValueError("y must be 0 or 1 but it is {!r}".format(y))

    # A saturated sigmoid can return exactly 0.0 or 1.0, which would make
    # log() raise a math domain error; floor the probability so the loss
    # stays finite (it caps at roughly 34.5).
    smallest_probability = 1e-15
    return -math.log(max(probability_of_truth, smallest_probability))

In [4]:
def sigmoid(z):
    """
    Logistic function 1 / (1 + e^-z).

    Written so that exp() is only ever called with a non-positive argument:
    evaluating e^-z directly overflows for large negative z, whereas the
    equivalent form e^z / (1 + e^z) simply underflows to 0.0 there.
    """
    if z >= 0:
        return 1 / (1 + math.exp(-z))

    exp_z = math.exp(z)
    return exp_z / (1 + exp_z)

## Derivatives

In [5]:
def by_derivative(y_hat, y):
    """
    Derivative of the log loss with respect to the output bias (by).

    y_hat is the prediction, y is the true value; the result is the
    prediction error y_hat - y.
    """
    prediction_error = y_hat - y
    return prediction_error

In [6]:
def Wy_derivative(y_hat, y, a):
    """
    Derivative of the log loss with respect to an output weight (Wy).

    a is the hidden activation that the weight multiplies; the result is
    the output-bias derivative scaled by that activation.
    """
    output_error = by_derivative(y_hat, y)
    return output_error * a

In [7]:
def ba_derivative(y_hat, y, Wy, an):
    """
    Derivative of the log loss with respect to a hidden unit's bias (ba),
    for a single time step.

    Wy is the output weight attached to the hidden unit and an is that
    unit's current activation. The result is the prediction error
    (y_hat - y), passed back through Wy and through the sigmoid slope
    an * (1 - an).
    """
    error_at_hidden_unit = (y_hat - y) * Wy
    one_minus_an = 1 - an
    return error_at_hidden_unit * an * one_minus_an

In [8]:
def Waa_derivative(y_hat, y, Wy, an, a_prev):
    """
    Derivative of the log loss with respect to a recurrent weight (Waa),
    for a single time step.

    a_prev is the previous-step activation that the weight multiplies; the
    result is the hidden-bias derivative scaled by a_prev.
    """
    hidden_bias_gradient = ba_derivative(y_hat, y, Wy, an)
    return hidden_bias_gradient * a_prev

def Wax_derivative(y_hat, y, Wy, an, x):
    """
    Derivative of the log loss with respect to an input weight (Wax),
    for a single time step.

    x is the input value that the weight multiplies; the result is the
    hidden-bias derivative scaled by x.
    """
    hidden_bias_gradient = ba_derivative(y_hat, y, Wy, an)
    return hidden_bias_gradient * x

## Sample data

In [9]:
# Size and shape of the toy dataset.
NUM_SEQUENCES = 1000
MIN_SEQUENCE_LENGTH = 4
MAX_SEQUENCE_LENGTH = 10
DATA_SEED = 0  # fixed so a fresh kernel regenerates exactly the same X

# A dedicated generator keeps the data reproducible without reseeding
# (or consuming) the module-level `random` state.
data_rng = random.Random(DATA_SEED)

# X is a list of random bit sequences of varying length.
X = []
for _ in range(NUM_SEQUENCES):
    sequence_length = data_rng.randint(MIN_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH)
    X.append([data_rng.choice([0, 1]) for _ in range(sequence_length)])

In [10]:
# Targets: each input sequence shifted right by one position, with a 0
# entering on the left and the last element dropped.
y = [[0] + sequence[:-1] for sequence in X]

In [11]:
X[:10]

[[0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 1, 1, 0, 1, 0, 0, 0],
 [0, 1, 0, 1, 1, 1],
 [1, 1, 0, 1, 1, 0],
 [0, 0, 1, 1, 1, 0, 1],
 [1, 0, 0, 0, 0, 1, 0],
 [1, 1, 1, 1, 1, 0, 0, 0, 1],
 [0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 1, 1, 0, 0]]

In [12]:
y[:10]

[[0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 1, 1, 0, 1, 0, 0],
 [0, 0, 1, 0, 1, 1],
 [0, 1, 1, 0, 1, 1],
 [0, 0, 0, 1, 1, 1, 0],
 [0, 1, 0, 0, 0, 0, 1],
 [0, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 1, 1, 1, 1, 1, 0, 0],
 [0, 0, 1, 1, 0]]

## Define the RNN

In [18]:
class RNN:
    """
    Minimal recurrent network built from scalar parameters: one input value
    per time step, two sigmoid hidden units and one sigmoid output.

    One time step (see ``run``) computes

        a_first  = sigmoid(Waa_first_1*a_first_prev  + Waa_first_2*a_second_prev  + Wax_first*x  + ba_first)
        a_second = sigmoid(Waa_second_1*a_first_prev + Waa_second_2*a_second_prev + Wax_second*x + ba_second)
        y_hat    = sigmoid(Wy_first*a_first + Wy_second*a_second + by)

    Training (``update_weights``) sums the per-time-step gradients over one
    sequence and then takes a single gradient-descent step. Each time step's
    gradient treats the incoming activations as constants, i.e. nothing is
    propagated back to earlier time steps.
    """

    # Parameter names in the exact order of the gradient tuple returned by
    # forward_pass; update_weights relies on this alignment.
    _PARAMETER_NAMES = (
        "Wy_first", "Wy_second", "by",
        "Waa_first_1", "Waa_first_2", "ba_first",
        "Waa_second_1", "Waa_second_2", "ba_second",
        "Wax_first", "Wax_second",
    )

    def __init__(self, use_reference_weights=True):
        """
        Create the eleven scalar parameters and the initial hidden state.

        use_reference_weights: when True (the default, and the previous
        behaviour) the randomly drawn parameters are replaced by the
        hand-picked values below. Pass False to keep the random
        initialisation and train from scratch.
        """
        # Draw every parameter from a Gaussian(0, 0.2) first. This happens in
        # both modes so the global random stream is consumed exactly as before.
        for name in self._PARAMETER_NAMES:
            setattr(self, name, random.gauss(0, 0.2))

        # hidden activations used at the start of every sequence
        self.a_first = 0
        self.a_second = 0

        if use_reference_weights:
            # an example of correct output parameters
            # (the trailing notes are the original author's)
            self.Wy_first = 0
            self.Wy_second = 10  # correct
            self.by = -5  # correct

            self.Waa_first_1 = 0
            self.Waa_first_2 = 0
            self.ba_first = 5  # -5

            self.Waa_second_1 = 10
            self.Waa_second_2 = 0
            self.ba_second = -5

            self.Wax_first = 10
            self.Wax_second = 0

    def predict(self, x_input):
        """
        Run the network over a whole sequence.

        x_input is a list of input values, one per time step.

        Returns a list with one y_hat per element of x_input. The hidden
        state starts from self.a_first / self.a_second and is carried from
        step to step locally; it is not stored back on the instance.
        """
        a_first, a_second = self.a_first, self.a_second

        output = []
        for x in x_input:
            y_hat, a_first, a_second = self.run(x, a_first, a_second)
            output.append(y_hat)

        return output

    def run(self, x, a_first, a_second):
        """
        Run one time step: from the input x and the previous hidden
        activations, compute the new activations and the prediction.

        Returns (y_hat, new_a_first, new_a_second).
        """
        z_first = (self.Waa_first_1 * a_first + self.Waa_first_2 * a_second
                   + self.Wax_first * x + self.ba_first)
        z_second = (self.Waa_second_1 * a_first + self.Waa_second_2 * a_second
                    + self.Wax_second * x + self.ba_second)
        current_a_first = sigmoid(z_first)
        current_a_second = sigmoid(z_second)

        z_output = (self.Wy_first * current_a_first
                    + self.Wy_second * current_a_second + self.by)
        current_y_hat = sigmoid(z_output)

        return current_y_hat, current_a_first, current_a_second

    def forward_pass(self, x, y, a_first, a_second):
        """
        Do a single time step of the training phase and calculate the
        gradients of the log loss for that step.

        x, y are the input and target for this step; a_first, a_second are
        the hidden activations coming from the previous step.

        Returns (gradients, new_a_first, new_a_second, y_hat) where gradients
        is a tuple ordered like _PARAMETER_NAMES.
        """
        # predict the y_hat and activations a
        y_hat, current_a_first, current_a_second = self.run(x, a_first, a_second)

        # output layer
        by_gradient = by_derivative(y_hat, y)
        Wy_first_gradient = Wy_derivative(y_hat, y, current_a_first)
        Wy_second_gradient = Wy_derivative(y_hat, y, current_a_second)

        # first hidden unit: the error reaches it through Wy_first
        ba_first_gradient = ba_derivative(y_hat, y, self.Wy_first, current_a_first)
        Wax_first_gradient = Wax_derivative(y_hat, y, self.Wy_first, current_a_first, x)
        Waa_first_1_gradient = Waa_derivative(y_hat, y, self.Wy_first, current_a_first, a_first)
        Waa_first_2_gradient = Waa_derivative(y_hat, y, self.Wy_first, current_a_first, a_second)

        # second hidden unit: the error reaches it through Wy_second
        ba_second_gradient = ba_derivative(y_hat, y, self.Wy_second, current_a_second)
        Wax_second_gradient = Wax_derivative(y_hat, y, self.Wy_second, current_a_second, x)
        Waa_second_1_gradient = Waa_derivative(y_hat, y, self.Wy_second, current_a_second, a_first)
        Waa_second_2_gradient = Waa_derivative(y_hat, y, self.Wy_second, current_a_second, a_second)

        gradients = (Wy_first_gradient, Wy_second_gradient, by_gradient,
                     Waa_first_1_gradient, Waa_first_2_gradient, ba_first_gradient,
                     Waa_second_1_gradient, Waa_second_2_gradient, ba_second_gradient,
                     Wax_first_gradient, Wax_second_gradient)

        return gradients, current_a_first, current_a_second, y_hat

    def update_weights(self, x_input, y_input, learning_rate=0.0001, print_loss=False):
        """
        Calculate the gradients and update the weights for a single datapoint
        (one input sequence and its target sequence).

        x_input, y_input are lists read position by position; y_input must
        have at least as many elements as x_input.
        learning_rate scales the gradient-descent step.
        print_loss: when True, print the log loss summed over the sequence
        (computed with the weights as they were before the update).
        """
        a1 = self.a_first
        a2 = self.a_second

        # one running total per parameter, in _PARAMETER_NAMES order
        gradient_sums = [0] * len(self._PARAMETER_NAMES)
        loss = 0

        for i, x in enumerate(x_input):
            y = y_input[i]
            gradients, a1, a2, y_hat = self.forward_pass(x, y, a1, a2)

            # sum up the gradients for the entire sequence
            gradient_sums = [total + gradient
                             for total, gradient in zip(gradient_sums, gradients)]

            loss += log_loss(y_hat, y)

        # print the loss for the individual sequence
        if print_loss:
            print("logistic loss", loss)

        # gradient descent update
        for name, gradient_sum in zip(self._PARAMETER_NAMES, gradient_sums):
            setattr(self, name, getattr(self, name) - learning_rate * gradient_sum)

## Train the RNN to do a right shift of values. 
For example, [1, 0, 1] shifted right would be [0, 1, 0]

In [39]:
model = RNN()

In [40]:
model.predict([0, 1])

[0.007152809912960745, 0.9928144512645394]

In [41]:
model.ba_first

5

In [42]:
model.update_weights([1, 1],[0, 1], learning_rate=1, print_loss=True)

logistic loss 0.014357029257883496


In [43]:
# NOTE(review): the saved output is 5.0, not the 3 noted below. With Wy_first == 0 the
# per-step gradient for ba_first, (y_hat - y) * Wy_first * a * (1 - a), is 0, so the
# preceding update_weights call leaves ba_first unchanged.
model.ba_first # should be 3

5.0

In [44]:
model.predict([0,1,1,0,0])

[0.007153145366154462,
 0.992864501270733,
 0.9928970229907063,
 0.9928970229906062,
 0.992864502850085]

In [45]:
for i in range(1000000):
    # report the loss on every 1000th iteration only
    print_loss = (i % 1000 == 0)

    # one gradient step on a single fixed sequence and its right-shifted target
    model.update_weights(
        [0, 1, 0, 1, 1, 1, 1, 1],
        [0, 0, 1, 0, 1, 1, 1, 1],
        learning_rate=0.01,
        print_loss=print_loss,
    )

logistic loss 9.928166952466437
logistic loss 4.088919006446303
logistic loss 2.9160551440223093
logistic loss 2.8897309059567107
logistic loss 2.86224539472021
logistic loss 2.842607826733218
logistic loss 2.8287772946719314
logistic loss 2.8189966987332955
logistic loss 2.812089573005358
logistic loss 2.807291844535451
logistic loss 2.8041059153690697
logistic loss 2.8022051601248297
logistic loss 2.8013751552837376
logistic loss 2.801477950493884
logistic loss 2.80243039837769
logistic loss 2.804191202369968
logistic loss 2.806753621456246
logistic loss 2.81014214578204
logistic loss 2.814412320981436
logistic loss 2.8196535105787746
logistic loss 2.8259949203486734
logistic loss 2.8336158117614794
logistic loss 2.842761666789876
logistic loss 2.8537693618750968
logistic loss 2.867106502910218
logistic loss 2.8834333843384576
logistic loss 2.9037005871933927
logistic loss 2.929298039646864
logistic loss 2.9622545346997002
logistic loss 3.005369011297642
logistic loss 3.0616388699149

logistic loss 0.12532300416282632
logistic loss 0.12429241697105721
logistic loss 0.12327631068119745
logistic loss 0.12227440475866368
logistic loss 0.12128642549018506
logistic loss 0.12031210578652102
logistic loss 0.11935118499181196
logistic loss 0.11840340869912139
logistic loss 0.11746852857213998
logistic loss 0.11654630217267893
logistic loss 0.11563649279382918
logistic loss 0.1147388692984905
logistic loss 0.11385320596323505
logistic loss 0.11297928232705312
logistic loss 0.11211688304513857
logistic loss 0.11126579774718229
logistic loss 0.11042582090030827
logistic loss 0.10959675167626727
logistic loss 0.1087783938229235
logistic loss 0.10797055553969623
logistic loss 0.10717304935693465
logistic loss 0.10638569201912242
logistic loss 0.10560830437162035
logistic loss 0.10484071125099333
logistic loss 0.10408274137867195
logistic loss 0.10333422725791194
logistic loss 0.10259500507392562
logistic loss 0.10186491459697747
logistic loss 0.10114379908853127
logistic loss 0.

logistic loss 0.03704557085489987
logistic loss 0.036924987325110625
logistic loss 0.036805108558363026
logistic loss 0.03668592867908504
logistic loss 0.03656744187494839
logistic loss 0.03644964239608025
logistic loss 0.036332524554195625
logistic loss 0.036216082721826415
logistic loss 0.03610031133151652
logistic loss 0.035985204875031394
logistic loss 0.03587075790259717
logistic loss 0.03575696502214794
logistic loss 0.03564382089857018
logistic loss 0.03553132025296832
logistic loss 0.03541945786195378
logistic loss 0.035308228556910154
logistic loss 0.03519762722330131
logistic loss 0.03508764879999757
logistic loss 0.034978288278573806
logistic loss 0.03486954070263486
logistic loss 0.03476140116717348
logistic loss 0.03465386481790926
logistic loss 0.03454692685065042
logistic loss 0.03444058251065078
logistic loss 0.03433482709201708
logistic loss 0.03422965593705074
logistic loss 0.034125064435696614
logistic loss 0.034021048024899
logistic loss 0.03391760218805792
logistic

logistic loss 0.01979805707449485
logistic loss 0.019759304202049747
logistic loss 0.019720689600261288
logistic loss 0.01968221255899197
logistic loss 0.01964387237284473
logistic loss 0.01960566834112088
logistic loss 0.019567599767790232
logistic loss 0.01952966596145271
logistic loss 0.019491866235296303
logistic loss 0.019454199907057698
logistic loss 0.019416666298987046
logistic loss 0.019379264737819406
logistic loss 0.019341994554725836
logistic loss 0.019304855085278703
logistic loss 0.019267845669429746
logistic loss 0.019230965651462283
logistic loss 0.019194214379958614
logistic loss 0.01915759120776222
logistic loss 0.019121095491950423
logistic loss 0.01908472659380064
logistic loss 0.019048483878748294
logistic loss 0.019012366716358885
logistic loss 0.018976374480302538
logistic loss 0.018940506548299624
logistic loss 0.018904762302116296
logistic loss 0.018869141127507563
logistic loss 0.018833642414201202
logistic loss 0.01879826555586439
logistic loss 0.018763009950

logistic loss 0.013270040734040287
logistic loss 0.013251413927473624
logistic loss 0.013232835141651985
logistic loss 0.013214304197611184
logistic loss 0.013195820917256757
logistic loss 0.01317738512335345
logistic loss 0.01315899663953082
logistic loss 0.013140655290272168
logistic loss 0.013122360900912861
logistic loss 0.01310411329762341
logistic loss 0.013085912307423632
logistic loss 0.013067757758160492
logistic loss 0.01304964947851534
logistic loss 0.013031587297990024
logistic loss 0.013013571046909814
logistic loss 0.012995600556411737
logistic loss 0.01297767565843894
logistic loss 0.012959796185752494
logistic loss 0.01294196197189738


In [47]:
def binarize(x_input, threshold=0.50):
    """
    Turn a list of values into hard 0/1 labels.

    x_input is a list of numbers (for example the output of predict)
    threshold is the cut-off; values below it become 0, all others 1

    Returns a new list of the same length as x_input.
    """
    return [0 if x < threshold else 1 for x in x_input]

In [48]:
binarize(model.predict([0, 1, 0, 1, 1, 1, 1, 1]))

[0, 0, 1, 0, 1, 1, 1, 1]

In [49]:
binarize(model.predict([1, 1]))

[0, 1]

In [50]:
model.predict([0, 1, 0, 1, 1, 1, 1, 1])

[3.373542276608194e-16,
 0.0032888399065128344,
 1.0,
 0.0032756576472684393,
 0.9977901439457997,
 0.9986052741477939,
 0.9986300230577,
 0.9986309764993976]

In [38]:
binarize( model.predict([0, 1, 0, 1, 1, 1, 1, 1]) )

[0, 1, 1, 1, 1, 1, 1, 1]