In [None]:
import numpy as np

## Weight initializers

In [None]:
class RandomInitializer():
    """Weight initializer that samples every entry from a standard normal."""

    def initialize(self, shape):
        """Build a 2-D array of random weights.

        Args:
            shape: indexable whose first two entries are read as
                (n_rows, n_cols).

        Returns:
            np.ndarray of shape (shape[0], shape[1]) drawn with
            np.random.randn (global NumPy random state).
        """
        n_rows, n_cols = shape[0], shape[1]
        return np.random.randn(n_rows, n_cols)

class ZerosInitializer():
    """Initializer that produces an array filled with zeros."""

    def initialize(self, shape):
        """Return a float array of the requested shape with every entry 0.

        Args:
            shape: int or tuple of ints accepted by NumPy as an array shape.
        """
        return np.full(shape, 0.0)

class HeInitializer():
    """Initializer that scales standard-normal samples by sqrt(2 / shape[1])."""

    def initialize(self, shape):
        """Build a 2-D weight array.

        Args:
            shape: indexable read as (n_rows, n_cols); n_cols is the value
                the variance is scaled by (Layer.initialize passes the
                previous layer's unit count there).

        Returns:
            np.ndarray of shape (shape[0], shape[1]).
        """
        n_rows, n_cols = shape[0], shape[1]
        scale = np.sqrt(2 / n_cols)
        return np.random.randn(n_rows, n_cols) * scale


## Activation functions

In [None]:
class RELU():
    """Rectified linear unit: passes positive entries, zeroes the rest."""

    def activate(self, Z):
        """Return Z with every entry that is not strictly positive multiplied by 0."""
        positive_mask = Z > 0
        return Z * positive_mask

    def derivative(self, Z):
        """Return 1 where Z > 0 and 0 elsewhere (integer-valued)."""
        return (Z > 0) * 1


class Sigmoid():
    """Logistic sigmoid activation, 1 / (1 + exp(-Z)), computed without overflow."""

    def activate(self, Z):
        """Return the element-wise sigmoid of Z.

        exp(-|Z|) always lies in (0, 1], so unlike exp(-Z) it cannot
        overflow for large negative inputs. The two branches below are the
        same function written for Z >= 0 and Z < 0 respectively.
        """
        decay = np.exp(-np.abs(Z))
        return np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))

    def derivative(self, Z):
        """Return sigmoid(Z) * (1 - sigmoid(Z)), evaluating the sigmoid once."""
        A = self.activate(Z)
        return A * (1 - A)


class Linear():
    """Identity activation."""

    def activate(self, Z):
        """Return Z unchanged (the very same object)."""
        return Z

    def derivative(self, Z):
        """Return a float array of ones with the same shape as Z."""
        return np.full(Z.shape, 1.0)



## Costs

In [None]:
class BinaryCrossEntropy():
    """Binary cross-entropy cost and its derivative w.r.t. the predictions."""

    def _clip_probabilities(self, y_pred):
        """Keep predictions inside [1e-15, 1 - 1e-15] so log and division stay finite."""
        tiny = 10 ** -15
        return np.clip(y_pred, tiny, 1 - tiny)

    def compute_cost(self, y_pred, y_true):
        """Return the cost averaged over the examples.

        Args:
            y_pred: predicted probabilities, shape (1, m_examples).
            y_true: target labels, shape (1, m_examples).

        Returns:
            Array produced by summing the per-example losses along axis 1
            (keepdims=True) and scaling by 1 / m_examples.
        """
        n_examples = y_true.shape[1]
        p = self._clip_probabilities(y_pred)

        per_example = -(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))
        return np.sum(per_example, axis=1, keepdims=True) * (1 / n_examples)

    def derivative(self, y_pred, y_true):
        """Return the element-wise derivative of the loss w.r.t. y_pred.

        The result is not divided by the number of examples.
        """
        p = self._clip_probabilities(y_pred)
        return (1 - y_true) / (1 - p) - (y_true / p)

## Layers

In [None]:
class Layer():
    """Fully connected layer: Z = W . A_prev + b, A = activation(Z).

    The layer caches its inputs/outputs during the forward pass and stores
    its gradients (dA, dZ, dW, db) during the backward pass. It also holds
    the per-parameter optimizer state (V_* and S_*) that the Momentum and
    Adam optimizers read and write.
    """

    def __init__(self, n_units, activation, l2_reg=0, weight_initializer=HeInitializer):
        """Create a layer whose weights are allocated later by initialize().

        Args:
            n_units: number of units; W has n_units rows and b is (n_units, 1).
            activation: activation *class*; it is instantiated here with no
                arguments and must provide activate(Z) and derivative(Z).
            l2_reg: coefficient of the L2 term added to dW in back_propogation.
            weight_initializer: initializer *class*; instantiated in
                initialize() and asked for the weight matrix.
        """
        self.n_units = n_units
        self.l2_reg = l2_reg
        self.activation = activation()

        # forward-pass cache
        self.A_prev = None
        self.Z = None
        self.A = None

        # parameters; W needs the previous layer's size, so it stays None
        # until initialize() is called
        self.initializer = weight_initializer
        self.W = None
        self.b = ZerosInitializer().initialize((n_units, 1))

        # optimizer state used by Momentum and Adam
        # first moments
        self.V_dW = None
        self.V_db = ZerosInitializer().initialize((n_units, 1))
        # second moments (RMSprop part); named S_dW to match initialize()
        # and the Adam optimizer
        self.S_dW = None
        self.S_db = ZerosInitializer().initialize((n_units, 1))

        # gradients
        self.dZ = None
        self.dA = None

        self.dW = None
        self.db = None

    def initialize(self, n_units_prev):
        """Allocate W with shape (n_units, n_units_prev) and zero its moments."""
        shape = (self.n_units, n_units_prev)
        self.W = self.initializer().initialize(shape)

        # optimizer state must match W's shape
        self.V_dW = ZerosInitializer().initialize(shape)
        self.S_dW = ZerosInitializer().initialize(shape)

    def forward_propogation(self, A_prev):
        """Compute and cache Z and A for the given input, then return A.

        Args:
            A_prev: input activations; columns are examples
                (np.dot(W, A_prev) requires n_units_prev rows).

        Raises:
            RuntimeError: if initialize() has not been called yet.
        """
        if self.W is None:
            raise RuntimeError("Layer weights are not initialized; call initialize() first.")

        # keep A_prev for backprop
        self.A_prev = A_prev

        self.Z = np.dot(self.W, A_prev) + self.b
        self.A = self.activation.activate(self.Z)

        return self.A

    def back_propogation(self, W_next=None, dZ_next=None, dA_final=None):
        """Compute and store dA, dZ, dW and db for the last forward pass.

        Args:
            W_next: weight matrix of the following layer (hidden layers).
            dZ_next: dZ of the following layer (hidden layers).
            dA_final: derivative of the cost w.r.t. this layer's output;
                pass it only for the final layer, in which case W_next and
                dZ_next are ignored.

        Raises:
            ValueError: if neither dA_final nor both W_next and dZ_next
                are supplied.
        """
        if dA_final is not None:
            # final layer: the cost derivative is handed in directly
            self.dA = dA_final
        elif W_next is None or dZ_next is None:
            raise ValueError("Either both W_next and dZ_next must be provided, or dA_final must be provided.")
        else:
            self.dA = np.dot(W_next.T, dZ_next)

        batch_size = self.Z.shape[1]

        self.dZ = self.dA * self.activation.derivative(self.Z)

        # regularization
        l2_term = (self.l2_reg / batch_size) * self.W

        self.dW = np.dot(self.dZ, self.A_prev.T) * (1 / batch_size) + l2_term
        self.db = np.sum(self.dZ, axis=1, keepdims=True) * (1 / batch_size)


## Optimizers

In [None]:
class GradientDescent():
    """Plain gradient descent: parameter <- parameter - learning_rate * gradient."""

    def __init__(self, learning_rate):
        """Store the step size; the step counter starts at 1."""
        self.learning_rate = learning_rate
        self.counter = 1

    def update(self, layer):
        """Rebind layer.W and layer.b to the values after one descent step.

        Reads layer.dW and layer.db; new arrays are created rather than
        modifying the old ones in place.
        """
        step = self.learning_rate
        layer.W = layer.W - step * layer.dW
        layer.b = layer.b - step * layer.db

    def tick(self):
        """Advance the step counter by one."""
        self.counter = self.counter + 1


class Momentum():
    """Gradient descent with an exponential moving average of the gradients."""

    def __init__(self, learning_rate, beta=0.9, bias_correction=False):
        """
        Args:
            learning_rate: step size applied to the velocity.
            beta: decay rate of the moving average.
            bias_correction: if True, divide the velocity by
                (1 - beta ** counter) before applying it.
        """
        self.counter = 1
        self.learning_rate = learning_rate
        self.beta = beta

        self.bias_correction = bias_correction
        self.epsilon = 10 ** -8

    def update(self, layer):
        """Update the layer's velocities, then step W and b along them.

        Reads layer.dW / layer.db, and reads and writes layer.V_dW /
        layer.V_db, layer.W and layer.b.
        """
        # compute new velocities
        layer.V_dW = self.beta * layer.V_dW + (1 - self.beta) * layer.dW
        layer.V_db = self.beta * layer.V_db + (1 - self.beta) * layer.db

        step_dW, step_db = layer.V_dW, layer.V_db

        if self.bias_correction:
            # The correction is applied to local copies only: writing it
            # back into layer.V_* would feed the rescaled value into the
            # next step's moving average and inflate it.
            correction = 1 - self.beta ** self.counter
            step_dW = step_dW / correction
            step_db = step_db / correction

        # update params
        layer.W = layer.W - self.learning_rate * step_dW
        layer.b = layer.b - self.learning_rate * step_db

    def tick(self):
        """Advance the step counter used by the bias correction."""
        self.counter += 1


class Adam():
    """Adam optimizer: momentum on the gradients plus RMSprop-style scaling."""

    def __init__(self, alpha, beta1=0.9, beta2=0.99, bias_correction=False):
        """
        Args:
            alpha: learning rate.
            beta1: decay rate of the first-moment (momentum) average.
            beta2: decay rate of the second-moment (RMSprop) average.
            bias_correction: if True, divide each moment by
                (1 - beta ** counter) before using it in the step.
        """
        self.counter = 1

        self.alpha = alpha
        self.beta1 = beta1 #Momentum
        self.beta2 = beta2 #RMSprop

        self.bias_correction = bias_correction

        # added to the denominator of the step to avoid division by zero
        self.epsilon = 10 ** -8

    def update(self, layer):
        """Update the layer's moment estimates, then step W and b.

        Reads layer.dW / layer.db, and reads and writes layer.V_dW, V_db,
        S_dW, S_db, W and b.
        """
        # compute new first moments
        layer.V_dW = self.beta1 * layer.V_dW + (1 - self.beta1) * layer.dW
        layer.V_db = self.beta1 * layer.V_db + (1 - self.beta1) * layer.db

        # compute new second moments
        layer.S_dW = self.beta2 * layer.S_dW + (1 - self.beta2) * np.square(layer.dW)
        layer.S_db = self.beta2 * layer.S_db + (1 - self.beta2) * np.square(layer.db)

        V_dW, V_db = layer.V_dW, layer.V_db
        S_dW, S_db = layer.S_dW, layer.S_db

        if self.bias_correction:
            # Corrected values are kept local: storing them back on the
            # layer would feed rescaled moments into the next step's
            # moving averages.
            first_fix = 1 - self.beta1 ** self.counter
            second_fix = 1 - self.beta2 ** self.counter

            V_dW, V_db = V_dW / first_fix, V_db / first_fix
            S_dW, S_db = S_dW / second_fix, S_db / second_fix

        # update parameters
        layer.W = layer.W - self.alpha * (V_dW / (np.sqrt(S_dW) + self.epsilon))
        layer.b = layer.b - self.alpha * (V_db / (np.sqrt(S_db) + self.epsilon))

    def tick(self):
        """Advance the step counter used by the bias correction."""
        self.counter += 1






In [37]:
class Model():
    """Sequential stack of layers trained with a cost function and an optimizer."""

    def __init__(self, X, Y, layers):
        """
        Args:
            X: training inputs, shape (n_features, m_examples).
            Y: training targets, shape (1, m_examples).
            layers: list of layer objects, ordered from input to output.
        """
        self.layers = layers
        self.n_layers = len(layers)

        self.X = X
        self.Y = Y

        self.minibatches = []  # list of tuples (X_batch, Y_batch)

        self.costfunc = None
        self.optimizer = None

    def _init_weights(self):
        """Initialize every layer, chaining each layer's size to the next one."""
        n_unit_prev = self.X.shape[0]

        for layer in self.layers:
            layer.initialize(n_unit_prev)

            # the next layer's input size is this layer's unit count
            n_unit_prev = layer.n_units

    def _make_minibatches(self, batch_size=None):
        """Rebuild self.minibatches from X and Y.

        Args:
            batch_size: None for a single batch holding all examples,
                otherwise a positive integer; examples are split in their
                stored order (no shuffling) and the last batch may be smaller.

        Raises:
            ValueError: if batch_size is not None and not a positive integer.
        """
        # start from scratch so repeated fit() calls do not accumulate batches
        self.minibatches = []

        if batch_size is None:
            self.minibatches.append((self.X, self.Y))
            return

        if batch_size != int(batch_size) or batch_size < 1:
            raise ValueError("batch_size must be a positive integer or None")
        batch_size = int(batch_size)

        n_examples = self.X.shape[1]
        for start in range(0, n_examples, batch_size):
            stop = start + batch_size
            self.minibatches.append((self.X[:, start:stop], self.Y[:, start:stop]))

    def compile(self, costfunc, optimizer):
        """Attach the cost function and optimizer used by fit()."""
        self.costfunc = costfunc
        self.optimizer = optimizer

    def predict(self, X):
        """Run a forward pass through all layers and return the last output.

        With no layers the input is returned unchanged.
        """
        A = X

        for layer in self.layers:
            A = layer.forward_propogation(A)

        return A

    def _backprop(self, dA_final):
        """Back-propagate from the last layer to the first.

        Args:
            dA_final: derivative of the cost w.r.t. the final layer's output.
        """
        L = self.n_layers

        # the last layer receives the cost derivative directly
        self.layers[L - 1].back_propogation(dA_final=dA_final)

        # loop goes from L - 2 down to 0
        for l in range(L - 2, -1, -1):
            self.layers[l].back_propogation(self.layers[l + 1].W,
                                            self.layers[l + 1].dZ)

    def _update_all_params(self):
        """Apply one optimizer step to every layer, then advance the optimizer."""
        for layer in self.layers:
            self.optimizer.update(layer)

        self.optimizer.tick()

    def fit(self, epochs, batch_size=None):
        """Train the model; weights are re-initialized on every call.

        Args:
            epochs: number of passes over the mini-batches.
            batch_size: forwarded to _make_minibatches (None = full batch).

        Returns:
            List with the cost over the full training set measured at the
            start of each epoch (before that epoch's updates).

        Raises:
            RuntimeError: if compile() has not been called.
        """
        if self.costfunc is None or self.optimizer is None:
            raise RuntimeError("Call compile() before fit().")

        # init weights
        self._init_weights()

        # split into minibatches
        self._make_minibatches(batch_size)

        history = []
        for epoch in range(1, epochs + 1):

            # cost on the whole training set before this epoch's updates
            all_predictions = self.predict(self.X)
            total_cost = self.costfunc.compute_cost(all_predictions, self.Y)
            history.append(total_cost)
            print("Epoch #{}, cost = {}".format(epoch, total_cost))

            for X_batch, Y_batch in self.minibatches:
                batch_predictions = self.predict(X_batch)

                dA_final = self.costfunc.derivative(batch_predictions, Y_batch)

                self._backprop(dA_final)
                self._update_all_params()

        return history








In [31]:
# Stand-alone sigmoid layer with 3 units (not referenced by the visible cells below)
l1 = Layer(3, activation=Sigmoid)

# Toy data set: 5 examples with 2 features each, written one example per row
# and transposed so that X has shape (n_features, m_examples) = (2, 5).
X = np.array([
    [3, 4],
    [0, 3],
    [2, 4],
    [0, 0],
    [3, 0],
]).T

# One binary label per example, shape (1, 5)
y = np.array([[1, 0, 1, 0, 1]])


In [74]:
# Two-layer network: 3 RELU units followed by a single sigmoid output unit
layers = [
    Layer(n_units=3, activation=RELU),
    Layer(n_units=1, activation=Sigmoid),
]

model = Model(X, y, layers)

# Binary cross-entropy cost, optimized with Adam at learning rate 0.1
model.compile(
    costfunc=BinaryCrossEntropy(),
    optimizer=Adam(0.1),
)

In [75]:
# Train for 30 epochs on the full data set (batch_size defaults to None).
# NOTE(review): no random seed is set in the visible cells, so the printed
# costs are expected to differ between runs.
model.fit(epochs=30)

Epoch #1, cost = [[2.59946848]]
Epoch #2, cost = [[1.93706964]]
Epoch #3, cost = [[1.23842055]]
Epoch #4, cost = [[0.70244106]]
Epoch #5, cost = [[0.50651027]]
Epoch #6, cost = [[0.55708919]]
Epoch #7, cost = [[0.61476996]]
Epoch #8, cost = [[0.62907016]]
Epoch #9, cost = [[0.60131499]]
Epoch #10, cost = [[0.563903]]
Epoch #11, cost = [[0.47974722]]
Epoch #12, cost = [[0.41282704]]
Epoch #13, cost = [[0.34181868]]
Epoch #14, cost = [[0.29051136]]
Epoch #15, cost = [[0.26282052]]
Epoch #16, cost = [[0.23645504]]
Epoch #17, cost = [[0.21179214]]
Epoch #18, cost = [[0.18906917]]
Epoch #19, cost = [[0.16840333]]
Epoch #20, cost = [[0.14981368]]
Epoch #21, cost = [[0.13324384]]
Epoch #22, cost = [[0.11858368]]
Epoch #23, cost = [[0.10568874]]
Epoch #24, cost = [[0.09439627]]
Epoch #25, cost = [[0.08453773]]
Epoch #26, cost = [[0.07594779]]
Epoch #27, cost = [[0.0684703]]
Epoch #28, cost = [[0.0619618]]
Epoch #29, cost = [[0.05629318]]
Epoch #30, cost = [[0.05135004]]


In [76]:
# Forward pass of X through every layer of the model; the final layer's
# activations are shown as the cell output.
model.predict(X)

array([[0.99999764, 0.11051053, 0.99905918, 0.11051053, 0.99999788]])

In [None]:
# Same two-layer network as above, wired by hand without the Model class
layer1 = Layer(n_units=3, activation = RELU)
layer2 = Layer(1, Sigmoid)
costf = BinaryCrossEntropy()

# X has 2 features; layer1 has 3 units
layer1.initialize(2)
layer2.initialize(3)



alpha = 0.01
epochs = 100

optimizer = Adam(alpha)

for i in range(epochs+1):
    # forward pass
    a1 = layer1.forward_propogation(X)
    a2 = layer2.forward_propogation(a1)

    cost = costf.compute_cost(a2, y)
    # report roughly ten times per run; max(1, ...) keeps the modulus
    # non-zero when epochs < 10
    if (i % max(1, epochs // 10) == 0):
        print("i = {}, cost = {}".format(i, cost))


    dA_final = costf.derivative(a2, y)

    #back_prop: output layer first, then the hidden layer
    layer2.back_propogation(dA_final=dA_final)
    layer1.back_propogation(layer2.W, layer2.dZ)

    #update params; tick once per step so both layers share the same counter
    optimizer.update(layer1)
    optimizer.update(layer2)
    optimizer.tick()


i = 0, cost = [[0.62000051]]
i = 10, cost = [[0.57468473]]
i = 20, cost = [[0.52557084]]
i = 30, cost = [[0.46696421]]
i = 40, cost = [[0.39693643]]
i = 50, cost = [[0.31846955]]
i = 60, cost = [[0.23861208]]
i = 70, cost = [[0.17038884]]
i = 80, cost = [[0.12540482]]
i = 90, cost = [[0.10113414]]
i = 100, cost = [[0.08754819]]


In [None]:
# Forward pass through the two hand-wired layers with their current parameters
a1 = layer1.forward_propogation(X)
a2 = layer2.forward_propogation(a1)

# output-layer activations, shown as the cell output
a2

array([[0.99646007, 0.17372499, 0.96538687, 0.17372499, 0.98819461]])

In [None]:
# Current weight matrix of the output layer, shape (1, 3) per layer2.initialize(3)
layer2.W

array([[1.2004948 , 0.41839818, 0.24593324]])