Reference 
https://mycollegenotebook.medium.com/backpropagation-algorithm-%E5%8F%8D%E5%90%91%E5%82%B3%E6%92%AD%E6%BC%94%E7%AE%97%E6%B3%95-b6800c015044     
https://datasciocean.tech/deep-learning-core-concept/backpropagation-explain/

In [75]:
# Import Libs
import matplotlib.pyplot as plt
import numpy as np



In [76]:
# Functions to generate datasets

# The size of  training,test data are both n by 2
def generate_linear(n=100):

    import numpy as np
    pts = np.random.uniform(0, 1, (n, 2))
    inputs = []
    labels = []
    for pt in pts:
        inputs.append([pt[0], pt[1]])
        distance = (pt[0] - pt[1]) / 1.414
        if pt[0] > pt[1]:
            labels.append(0)
        else:
            labels.append(1)
    return np.array(inputs), np.array(labels).reshape(n, 1)


def generate_XOR_easy():
    import numpy as np
    inputs = []
    labels = []

    for i in range(11):
        inputs.append([0.1 * i, 0.1 * i])
        labels.append(0)

        if 0.1 * i == 0.5:
            continue

        inputs.append([0.1 * i, 1 - 0.1 * i])
        labels.append(1)

    return np.array(inputs), np.array(labels).reshape(21, 1)



In [77]:
# Plot results

def show_result(x,y,pred_y):
    import matplotlib.pyplot as plt
    plt.subplot(1,2,1)
    plt.title('Ground Truth',fontsize=18)
    for i in range(x.shape[0]):
        if y[i] == 0:
            plt.plot(x[i][0],x[i][1],'ro')
        else:
            plt.plot(x[i][0],x[i][1],'bo')
    plt.subplot(1,2,2)
    plt.title('Predict result',fontdict=18)

    for i in range(x.shape[0]):
        if pred_y[i] == 0:
            plt.plot(x[i][0],x[i][1],'ro')
        else:
            plt.plot(x[i][0],x[i][1],'bo')

    plt.show()


In [78]:
# Activative functions and their derivatives
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def derivative_sigmoid(x):
    return np.multiply(x, 1.0 - x)

# Mean Squared Error (MSE) function
# Returns the average of the squared differences between predicted and true values and the derivative of MSE
def MSE(y_true, y_pred):
    mse = np.mean(np.square(y_true - y_pred))
    derivative = 2 * (y_pred - y_true) / y_true.size
    return mse, derivative

activation_map = {
    'sigmoid': sigmoid
}



In [None]:
# Fully connected layer class
class Linear_Layer:
    """
        Example:
            Input : (n,m,activation function) , the weight maxtrix W is n by m
            z = xW + b , with size
            a = activation(z)
            delta : size m by 1
            dc : delta value of current layer
            dn : delta value of next layer (forward pass direction : layer c -> layer n)

            the delta value of output layer  = dC/da * derivative of activation function with input z

            the delta value of hidden layer = W^T(weight of current layer and next layer) * dn  * derivative of activation function with input z

            the gradient of weight matrix W = input * delta
            the gradient of bias b = delta
    """

    def __init__(self, input_size, output_size, activation='sigmoid'):

        if activation not in activation_map:
            activation = 'sigmoid'  # Default activation function

        self.input_size = input_size
        self.output_size = output_size
        # Initialize weights and bias (W , b)
        # Assume that the size of input vector is  n by 1
        # The size of weight matrix should be m by n, where m is the size of output vector

        # According to the guide that TA provided, the initial weights often use a small random value

        self.weights = np.random.randn(input_size, output_size) * 0.01
        self.bias = np.zeros((output_size, 1))
        self.input = None
        self.a = None  # The output of the layer after activation function is applied
        self.z = None  # The output of the layer before activation function is applied (z = Wx + b)
        self.activation = activation

        # delta = partial derivative of the loss function with respect to the output of this layer before applied activation function
        self.delta = None  # The delta value for backpropagation

    #   Forward pass: z = xW + b
    #   Ｒegular training process
    def forward(self, x):
        self.input = x
        self.z = np.dot(x,self.weights) + self.bias
        self.a = activation_map[self.activation](self.z)
        return self.a

    def backward(self, upstream_delta):
        # TODO : implement different activation function
        self.delta = np.dot(self.weights,
                               upstream_delta) * derivative_sigmoid(self.z)

        # TODO : update learning rate
        # Update weights and bias
        self.weights -= np.dot(self.input,self.delta) * 0.01
        self.bias -= self.delta * 0.01

        return self.delta


# TODO : model with hyperparameters
class Model:

    def __init__(self):
        self.loss = []
        self.layers = []

        # Build model
        self.layers.append(Linear_Layer(2, 4, activation='sigmoid'))
        self.layers.append(Linear_Layer(4, 4, activation='sigmoid'))
        self.layers.append(Linear_Layer(4, 1, activation='sigmoid'))

        self.num_of_layers = len(self.layers)

    def forward(self, x):
        # Forward pass through all layers
        for layer in self.layers:
            x = layer.forward(x)
        return x

    # Train the model
    # # x : input data, y : ground truth labels
    def train(self, x, y, epochs=100000, learning_rate=0.01):

        for epoch in range(epochs):
            # Forward pass
            output = self.forward(x)

            # Compute loss and its derivative
            loss, delta = MSE(y, output)

            self.loss.append(loss)

            if epoch % 1000 == 0:
                print(f'Epoch {epoch}, Loss: {loss:.4f}')
            # Backward pass
            for i in range(self.num_of_layers - 1, -1, -1):
                delta = self.layers[i].backward(delta)

    def plot_learning_curve(self):
        import matplotlib.pyplot as plt
        plt.plot(self.loss)
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.title('Learning Curve')
        plt.show()


In [80]:
x,y = generate_linear()
model = Model()
model.train(x,y)
model.plot_learning_curve()

ValueError: operands could not be broadcast together with shapes (100,4) (4,1) 