In [233]:
import numpy as np
from scipy.special import expit

class MultiLayerPerceptronBase(object):
    def __init__(self, layer_widths=[12,5,4,3],
                 C=0.0, epochs=500, eta=0.001, random_state=None):
        np.random.seed(random_state)
        self.l2_C = C
        self.epochs = epochs
        self.eta = eta
        self.weights = []
        self.layer_widths=layer_widths
        
    @staticmethod
    def _encode_labels(y):
        """Encode labels into one-hot representation"""
        onehot = pd.get_dummies(y).values.T
            
        return onehot

    def _initialize_weights(self):
        """Initialize weights with small random numbers."""
        
        for idx, w in enumerate(self.layer_widths[:-1]):
            W_element_count = ((w + 1) * self.layer_widths[idx+1])
            W = np.random.uniform(-1,1, size = W_element_count)
            W.reshape((self.layer_widths[idx+1], -1))
            self.weights.append(W.reshape((self.layer_widths[idx+1], -1)))    
        for w in self.weights:
            print(w.shape)
    @staticmethod
    def _sigmoid(z):
        """Use scipy.special.expit to avoid overflow"""
        # 1.0 / (1.0 + np.exp(-z))
        return expit(z)
    
    def _derive_activations(self, a, w, v, t="sigmoid"):
        if t == "sigmoid": 
            print(a.shape, w.shape, v.shape)
            return (a*(1-a) * (w[:,1:].T @ v[1:,:]))
    
    def _derive_objective(self, y, a, t="mse"):
        if t == "mse":
            return -2*(y-a)*a*(1-a)
    
    @staticmethod
    def _add_bias_unit(X, how='column'):
        """Add bias unit (column or row of 1s) to array at index 0"""
        if how == 'column':
            ones = np.ones((X.shape[0], 1))
            X_new = np.hstack((ones, X))
        elif how == 'row':
            ones = np.ones((1, X.shape[1]))
            X_new = np.vstack((ones, X))
        return X_new
    
    def _L2_reg(self, lambda_):
        """Compute L2-regularization cost"""
        # only compute for non-bias terms    
        s = 0
        for w in self.weights:
            s += np.mean(w[:, 1:] **2)
        return (lambda_/len(self.weights)) * np.sqrt(s)
    
    def _cost(self,A3,Y_enc,W1,W2):
        '''Get the objective function value'''
        cost = np.mean((Y_enc-A3)**2)
        L2_term = self._L2_reg(self.l2_C, W1, W2)
        return cost + L2_term

In [234]:
m = MultiLayerPerceptronBase()
m._initialize_weights()
m._L2_reg(0.001)

(5, 13)
(4, 6)
(3, 5)


0.0003360587229861165

In [235]:
class MultiLayerPerceptron(MultiLayerPerceptronBase):
    def _feedforward(self, X):
        Z = []
        A = []
        x = X
        A.append(self._add_bias_unit(x.T, how="row"))
        for idx, w in enumerate(self.weights[:-1]):
#                 z_l = w @ self._add_bias_unit(A[-1], how = "row")
            z_l = w @ A[-1]
            a_l = self._sigmoid(z_l)
            a_l = self._add_bias_unit(a_l, how="row")
            A.append(a_l)
            Z.append(z_l)

            print(a_l.shape)
        A.append(self._sigmoid(Z[-1]))
        print(A[-1].shape)
        return A, Z
    def _get_gradient(self, A, Y_enc):
        gradients = []
        v_last = self._derive_objective( Y_enc, A[-1], "mse")
        grad = v_last.T @ A[-2][1:,:]
        gradients.append(grad)
        for idx in range(len(self.weights), 0, -1):
            v_l = self._derive_activations(A[idx], self.weights[idx-1], v_last, "sigmoid")
            grad = v_l[1:,:] @ A[idx-1].T
            print(grad, "here")
            gradients.append(grad)
            v_last = v_l

        return list(reversed(gradients))

In [236]:
m = MultiLayerPerceptron()
m._initialize_weights()
A, Z = m._feedforward(np.array([1,2,3,4,5,6,7,8,9,10,11,12]).reshape(1, 12))
grad = m._get_gradient(A, np.array([1,2,4]))
print(grad)
    

(5, 13)
(4, 6)
(3, 5)
(6, 1)
(5, 1)
(4, 1)
(4, 1) (3, 5) (4, 3)


ValueError: shapes (3,3) and (1,5) not aligned: 3 (dim 1) != 1 (dim 0)