Q1. How is a deep neural network different from linear regression?
Q2. How to tune the parameters?
Q3. Why do we need hidden layers?

<div style="text-align: center;">
    <img src="./files/DNN/DNN_example.png" width="400" height="250">
</div>

### Each element formulation

$$
\begin{align*}
z_1 &= x_1w_1 + x_2w_2 + b_1 \\
z_2 &= x_1w_3 + x_2w_4 + b_2 \\
z_3 &= h_1w_5 + h_2w_6 + b_3 \\
z_4 &= h_1w_7 + h_2w_8 + b_4 \\

h_1 &= \sigma (z_1) \\
h_2 &= \sigma (z_2) \\

o_1 &= \sigma (z_3) \\
o_2 &= \sigma (z_4) \\

E_{o1} &= \frac{1}{2}(y_1 - o_1)^2 \\
E_{o2} &= \frac{1}{2}(y_2 - o_2)^2 \\
E_{total} &= E_{o1} + E_{o2} \\

\end{align*}
$$

### Backpropagation 1

Computing the gradient of $E_{total}$ with respect to $w_5$ (chain rule):
$$ \begin{align*}
\frac{\partial E_{total}}{\partial w_5} &= \frac{\partial E_{total}}{\partial o_1} \cdot \frac{\partial o_1}{\partial z_3} \cdot \frac{\partial z_3}{\partial w_5} \\
&= \frac{\partial (E_{o1} + E_{o2})}{\partial o_1} \cdot \frac{\partial o_1}{\partial z_3} \cdot \frac{\partial z_3}{\partial w_5} \\
& = -(y_1 - o_1) \cdot \sigma '(z_3) \cdot h_1
\end{align*} $$
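Using an optimizer (e.g. gradient descent, $w_5 \leftarrow w_5 - \eta \cdot \frac{\partial E_{total}}{\partial w_5}$), we can then update $w_5$ toward a better value.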

Computing the gradient with respect to $b_3$ is very similar to $w_5$:
$$ \begin{align*}
\frac{\partial E_{total}}{\partial b_3} &= \frac{\partial E_{total}}{\partial o_1} \cdot \frac{\partial o_1}{\partial z_3} \cdot \frac{\partial z_3}{\partial b_3} \\
&= \frac{\partial (E_{o1} + E_{o2})}{\partial o_1} \cdot \frac{\partial o_1}{\partial z_3} \cdot \frac{\partial z_3}{\partial b_3} \\
& = -(y_1 - o_1) \cdot \sigma '(z_3) \cdot 1
\end{align*} $$

Using this method, we can fit $w_5, w_6, w_7, w_8, b_3, b_4$.

$$

\begin{align*}
\nabla w_5 &= \frac{\partial E_{total}}{\partial o_1} \cdot \frac{\partial o_1}{\partial z_3} \cdot \frac{\partial z_3}{\partial w_5} \\
&= -(y_1 - o_1) \cdot \sigma '(z_3) \cdot h_1 \\

\nabla w_7 &= \frac{\partial E_{total}}{\partial o_2} \cdot \frac{\partial o_2}{\partial z_4} \cdot \frac{\partial z_4}{\partial w_7} \\
& = -(y_2 - o_2) \cdot \sigma '(z_4) \cdot h_1 \\

\nabla w_6 &= \frac{\partial E_{total}}{\partial o_1} \cdot \frac{\partial o_1}{\partial z_3} \cdot \frac{\partial z_3}{\partial w_6}\\
&= -(y_1 - o_1) \cdot \sigma '(z_3) \cdot h_2 \\

\nabla w_8 &= \frac{\partial E_{total}}{\partial o_2} \cdot \frac{\partial o_2}{\partial z_4} \cdot \frac{\partial z_4}{\partial w_8} \\
& = -(y_2 - o_2) \cdot \sigma '(z_4) \cdot h_2 \\

\nabla b_3 &= \frac{\partial E_{total}}{\partial o_1} \cdot \frac{\partial o_1}{\partial z_3} \cdot \frac{\partial z_3}{\partial b_3} \\
&= -(y_1 - o_1) \cdot \sigma '(z_3) \cdot 1 \\

\nabla b_4 &= \frac{\partial E_{total}}{\partial o_2} \cdot \frac{\partial o_2}{\partial z_4} \cdot \frac{\partial z_4}{\partial b_4} \\
& = -(y_2 - o_2) \cdot \sigma '(z_4) \cdot 1 \\ 

\end{align*}
$$

###  Backpropagation 2

Going one layer further back, we now compute the gradients of $w_1, w_2, w_3, w_4, b_1, b_2$.

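We first need the derivative of $E_{total}$ with respect to $h_1$ and $h_2$. Each hidden unit feeds both outputs, so its derivative is the sum of the contributions through $o_1$ and $o_2$.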
$$
\begin{align*}
    \frac{\partial E_{total}}{\partial h_1} &= \frac{\partial E_{o1}}{\partial h_1} + \frac{\partial E_{o2}}{\partial h_1} \\
    
    &=  \frac{\partial E_{o1}}{\partial z_3} \cdot \frac{\partial z_3}{\partial h_1}  + \frac{\partial E_{o2}}{\partial z_4} \cdot \frac{\partial z_4}{\partial h_1}\\
    
    &= \frac{\partial E_{o1}}{\partial o_1} \cdot \frac{\partial o_1}{\partial z_3}  \cdot \frac{\partial z_3}{\partial h_1} + \frac{\partial E_{o2}}{\partial o_2} \cdot \frac{\partial o_2}{\partial z_4}  \cdot \frac{\partial z_4}{\partial h_1}  \\

    & = -(y_1 - o_1) \cdot \sigma '(z_3) \cdot w_5 - (y_2 - o_2) \cdot \sigma ' (z_4) \cdot w_7 \\

    \frac{\partial E_{total}}{\partial h_2} &= \frac{\partial E_{o1}}{\partial h_2} + \frac{\partial E_{o2}}{\partial h_2} \\

    &= \frac{\partial E_{o1}}{\partial z_3} \cdot \frac{\partial z_3}{\partial h_2} + \frac{\partial E_{o2}}{\partial z_4} \cdot \frac{\partial z_4}{\partial h_2} \\

    &= \frac{\partial E_{o1}}{\partial o_1} \cdot \frac{\partial o_1}{\partial z_3} \cdot \frac{\partial z_3}{\partial h_2} + \frac{\partial E_{o2}}{\partial o_2} \cdot \frac{\partial o_2}{\partial z_4} \cdot \frac{\partial z_4}{\partial h_2} \\

    &= -(y_1 - o_1) \cdot \sigma '(z_3) \cdot w_6 - (y_2 - o_2) \cdot \sigma '(z_4) \cdot w_8
\end{align*}
$$

Using this, the final step is here.

$$

\begin{align*}
    \frac{\partial E_{total}}{\partial w_1} &= \frac{\partial E_{total}}{\partial h_1} \cdot \frac{\partial h_1}{\partial z_1} \cdot \frac{\partial z_1}{\partial w_1} \\
    &= \frac{\partial E_{total}}{\partial h_1} \cdot \sigma ' (z_1) \cdot x_1 \\

    \frac{\partial E_{total}}{\partial w_2} &= \frac{\partial E_{total}}{\partial h_1} \cdot \frac{\partial h_1}{\partial z_1} \cdot \frac{\partial z_1}{\partial w_2} \\
    &= \frac{\partial E_{total}}{\partial h_1} \cdot \sigma ' (z_1) \cdot x_2 \\

    \frac{\partial E_{total}}{\partial w_3} &= \frac{\partial E_{total}}{\partial h_2} \cdot \frac{\partial h_2}{\partial z_2} \cdot \frac{\partial z_2}{\partial w_3} \\
    &= \frac{\partial E_{total}}{\partial h_2} \cdot \sigma ' (z_2) \cdot x_1 \\

    \frac{\partial E_{total}}{\partial w_4} &= \frac{\partial E_{total}}{\partial h_2} \cdot \frac{\partial h_2}{\partial z_2} \cdot \frac{\partial z_2}{\partial w_4} \\
    &= \frac{\partial E_{total}}{\partial h_2} \cdot \sigma ' (z_2) \cdot x_2 \\

    \frac{\partial E_{total}}{\partial b_1} &= \frac{\partial E_{total}}{\partial h_1} \cdot \frac{\partial h_1}{\partial z_1} \cdot \frac{\partial z_1}{\partial b_1} \\
    &= \frac{\partial E_{total}}{\partial h_1} \cdot \sigma ' (z_1) \cdot 1 \\

    \frac{\partial E_{total}}{\partial b_2} &= \frac{\partial E_{total}}{\partial h_2} \cdot \frac{\partial h_2}{\partial z_2} \cdot \frac{\partial z_2}{\partial b_2} \\
    &= \frac{\partial E_{total}}{\partial h_2} \cdot \sigma ' (z_2) \cdot 1 \\
\end{align*}
$$

### Generalization

The generalization to layer $N$ is quite simple:

$$
\begin {align*}
\frac{\partial L}{\partial w_N} &= \frac{\partial L}{\partial a_N} \cdot \frac{\partial a_N}{\partial z_N}  \cdot \frac{\partial z_N}{\partial w_N} \\
\frac{\partial L}{\partial b_N} &= \frac{\partial L}{\partial a_N} \cdot \frac{\partial a_N}{\partial z_N} \cdot 
(\frac{\partial z_N}{\partial b_N} =1) \\
&= \frac{\partial L}{\partial a_N} \cdot \frac{\partial a_N}{\partial z_N}
\end {align*}
$$

Breaking this down into its partial-derivative pieces:

$$
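\frac{\partial L}{\partial a_N}: \text{derivative of the loss with respect to the activation of layer } N. \\ \\
\frac{\partial a_N}{\partial z_N}: \text{derivative of the activation function.} \\ \\
\frac{\partial z_N}{\partial w_N},\ \frac{\partial z_N}{\partial b_N}: \text{derivative of the pre-activation with respect to the weight (the layer input) or the bias (1).} \\ \\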
$$

### Implementation

In [None]:
import numpy as np
from tqdm import tqdm

class DNN:
    '''
    Fully connected feed-forward neural network with sigmoid activations.

    Every layer (hidden and output) computes a = sigmoid(a_prev . W + b).
    The network is trained with plain gradient descent on the mean squared
    error between the output activations and the targets.
    '''

    def __init__(self, input_size, hidden_layers, output_size, learning_rate=0.01):
        '''
        Store the hyper-parameters and create the weights and biases.

        Parameters:
        input_size (int): number of features of one sample
        hidden_layers (list): number of neurons of each hidden layer
        output_size (int): number of output neurons
        learning_rate (float): step size used by apply_gradient

        Attributes created:
        self.weights (list): one [prev_layer_size x layer_size] array per
            layer, drawn from a standard normal distribution scaled by 0.01
        self.biases (list): one [1 x layer_size] array of zeros per layer
        '''
        self.input_size = input_size
        # Copy into a list so tuples are accepted and later concatenation works.
        self.hidden_layers = list(hidden_layers)
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.weights = []
        self.biases = []

        # Initialize weights and biases for the hidden layers + output layer.
        prev_layer_size = self.input_size
        for layer_size in self.hidden_layers + [output_size]:
            self.weights.append(np.random.randn(prev_layer_size, layer_size) * 0.01)
            self.biases.append(np.zeros((1, layer_size)))
            prev_layer_size = layer_size

    def forward_propagation(self, X):
        '''
        Run the network on X and cache the intermediate values.

        Parameters:
        X (array-like): inputs, one sample per row (batch_size x input_size)

        Side effects:
        self.a (list): X followed by the activation of every layer
        self.z (list): pre-activation value of every layer

        Returns:
        np.ndarray: activations of the output layer (batch_size x output_size)
        '''
        # Keep the input as an ndarray: backward_propagation needs its .T.
        self.a = [np.asarray(X)]
        self.z = []
        for weight, bias in zip(self.weights, self.biases):
            z = np.dot(self.a[-1], weight) + bias
            self.z.append(z)
            self.a.append(self.sigmoid(z))
        return self.a[-1]  # Return output layer activations

    def backward_propagation(self, X, y):
        '''
        Compute the gradient of the loss for every weight and bias.

        Parameters:
        X (array-like): batch of inputs (batch_size x input_size), e.g.
            X = np.array([[0.5, 0.1, -0.2],   # first sample, 3 features
                          [0.2, 0.4, 0.1]])   # second sample, 3 features
            is a batch of 2 samples.
        y (array-like): targets for each sample (batch_size x output_size)

        Returns:
        dW (list): gradient of the loss w.r.t. each entry of self.weights
        db (list): gradient of the loss w.r.t. each entry of self.biases
        Both lists are ordered like self.weights (first layer first).
        '''
        X = np.asarray(X)
        self.batch_size = X.shape[0]  # number of samples in the batch

        y_pred = self.forward_propagation(X)

        # d L / d a_N for the output layer.
        dA = self.loss_derivative(y_pred, y)

        # Per-layer gradients are collected from the output layer backwards.
        # They stay Python lists because layers have different shapes.
        dW = []
        db = []
        for i in range(len(self.weights) - 1, -1, -1):
            # self.a[i + 1] is this layer's activation; sigmoid_derivative
            # expects the activation, not the pre-activation.
            dZ = dA * self.sigmoid_derivative(self.a[i + 1])  # * d a / d z
            dW.append(np.dot(self.a[i].T, dZ))                # * d z / d w
            db.append(np.sum(dZ, axis=0, keepdims=True))      # d z / d b = 1
            if i > 0:
                # Error propagated back to the previous layer's activation.
                dA = np.dot(dZ, self.weights[i].T)
        return dW[::-1], db[::-1]

    def sigmoid(self, x):
        '''Element-wise logistic function 1 / (1 + exp(-x)).'''
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        '''
        Derivative of the sigmoid expressed through its OUTPUT.

        x must already be sigmoid(z); the result is sigmoid'(z) = x * (1 - x).
        '''
        return x * (1 - x)

    def compute_loss(self, y_pred, y_true):
        '''Mean squared error, averaged over every element of the batch.'''
        return np.mean((y_true - y_pred) ** 2)

    def loss_derivative(self, y_pred, y_true):
        '''
        Gradient of compute_loss with respect to y_pred.

        compute_loss averages over all batch_size * output_size elements, so
        the gradient is divided by that same element count.
        '''
        y_pred = np.asarray(y_pred)
        return 2 * (y_pred - y_true) / y_pred.size

    def apply_gradient(self, dW, db):
        '''
        Gradient-descent step: move every weight and bias against its gradient.

        Parameters:
        dW (list): per-layer weight gradients, ordered like self.weights
        db (list): per-layer bias gradients, ordered like self.biases
        '''
        for i in range(len(self.weights)):
            self.weights[i] -= self.learning_rate * dW[i]
            self.biases[i] -= self.learning_rate * db[i]

    def train(self, X, y, epochs=1000, print_rate=10):
        '''
        Full-batch gradient descent.

        Parameters:
        X (array-like): inputs (n_samples x input_size)
        y (array-like): targets (n_samples x output_size)
        epochs (int): number of gradient steps over the whole data set
        print_rate (int): print the loss every print_rate epochs (0 disables)
        '''
        for epoch in tqdm(range(epochs)):
            dW, db = self.backward_propagation(X, y)
            self.apply_gradient(dW, db)
            if print_rate and epoch % print_rate == 0:
                y_pred = self.forward_propagation(X)
                loss = self.compute_loss(y_pred, y)
                print(f"Epoch {epoch}, Loss: {loss}")

    def train_mini_batch(self, X, y, batch_size=32, epoches=1000, print_rate=10):
        '''
        Mini-batch gradient descent with a fresh shuffle every epoch.

        Parameters:
        X (array-like): inputs (n_samples x input_size)
        y (array-like): targets (n_samples x output_size)
        batch_size (int): number of samples per gradient step
        epoches (int): number of passes over the whole data set
        print_rate (int): print the loss every print_rate epochs (0 disables)
        '''
        X = np.asarray(X)
        y = np.asarray(y)
        m = X.shape[0]
        for epoch in range(epoches):
            # Shuffle inputs and targets with the same permutation.
            indices = np.random.permutation(m)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            for i in range(0, m, batch_size):  # step through the data batch by batch
                X_batch = X_shuffled[i:i + batch_size]
                y_batch = y_shuffled[i:i + batch_size]

                # Backpropagate on the current batch.
                dW, db = self.backward_propagation(X_batch, y_batch)

                # Update the weights and biases.
                self.apply_gradient(dW, db)

            if print_rate and epoch % print_rate == 0:
                loss = self.compute_loss(self.forward_propagation(X), y)
                print(f"Epoch {epoch}, Loss: {loss}")

In [7]:
# Demo: shape[0] is the length of the first axis, which is how
# backward_propagation reads the number of samples in a batch.
A = np.arange(1, 6)
print(np.shape(A)[0])

5
