In [None]:
import numpy as np

# Define the input data (15 days of sales, 1 value per day)
data = np.array([100, 90, 120, 80, 150, 110, 130, 140, 115, 105, 90, 125, 135, 100, 95])

# Define the number of time steps in the input sequence
num_time_steps = 3

# Define the number of input features (in this case, we only have 1 input feature: sales)
num_input_features = 1

# Define the number of hidden units in the RNN layer
num_hidden_units = 5

# Initialize the weight matrices and bias vectors for the RNN layer
W_xh = np.random.randn(num_input_features, num_hidden_units)
W_hh = np.random.randn(num_hidden_units, num_hidden_units)
b_h = np.zeros((1, num_hidden_units))
W_hy = np.random.randn(num_hidden_units, num_input_features)
b_y = np.zeros((1, num_input_features))

# Initialize the hidden state (this is the initial state before processing any input)
h_t = np.zeros((1, num_hidden_units))

# Define the learning rate
learning_rate = 0.001

# Loop through the input sequence and update the hidden state at each time step
for t in range(num_time_steps):
    # Extract the input features for the current time step
    x_t = data[t]
    
    # Compute the new hidden state using the current input and previous hidden state
    h_t = np.tanh(np.dot(x_t, W_xh) + np.dot(h_t, W_hh) + b_h)
    
# Compute the output at the last time step
y_pred = np.dot(h_t, W_hy) + b_y

# Compute the target values (in this case, we want to predict the next day's sales based on the previous 3 days of sales)
y_true = data[num_time_steps:num_time_steps+1]

# Compute the error (mean squared error)
error = 0.5 * np.sum((y_pred - y_true) ** 2)

# Compute the gradients of the output layer (using the chain rule)
grad_y = y_pred - y_true
grad_W_hy = np.dot(h_t.T, grad_y)
grad_b_y = np.sum(grad_y, axis=0, keepdims=True)

# Initialize the gradient of the hidden state (this will be used as the initial gradient for backpropagation)
grad_h = np.zeros((1, num_hidden_units))

# Loop backward through the time steps and compute the gradients for each time step
for t in reversed(range(num_time_steps)):
    # Extract the input features for the current time step
    x_t = data[t]
    
    # Compute the gradients for the output of the RNN layer (using the chain rule)
    grad_output = grad_h + np.dot(grad_y, W_hy.T)
    grad_z = grad_output * (1 - h_t ** 2)
    
    # Compute the gradients for the parameters of the RNN layer (using the chain rule)
    grad_W_xh = np.dot(x_t.T, grad_z)
    grad_W_hh = np.dot(h_t.T, grad_z)
    grad_b_h = np.sum(grad_z, axis=0, keepdims=True)
    
    # Update the gradients for the next time step (using the chain rule)
    grad_h = np.dot(grad_z,W_hh.T)

# Update the parameters of the RNN layer using the computed gradients and the learning rate
W_xh -= learning_rate * grad_W_xh
W_hh -= learning_rate * grad_W_hh
b_h -= learning_rate * grad_b_h
W_hy -= learning_rate * grad_W_hy
b_y -= learning_rate * grad_b_y

print("W_xh:", W_xh)
print("W_hh:", W_hh)
print("b_h:", b_h)
print("W_hy:", W_hy)
print("b_y:", b_y)

In [16]:
import numpy as np

# Define the input data (15 days of sales, 1 value per day)
data = np.array([100, 90, 120, 80, 150, 110, 130, 140, 115, 105, 90, 125, 135, 100, 95])

# Define the number of time steps in the input sequence
num_time_steps = 3

# Define the number of input features (in this case, we only have 1 input feature: sales)
num_input_features = 1

# Define the number of hidden units in the RNN layer
num_hidden_units = 5

# Define the batch size
batch_size = 4

# Initialize the weight matrices and bias vectors for the RNN layer
W_xh = np.random.randn(num_input_features, num_hidden_units)
W_hh = np.random.randn(num_hidden_units, num_hidden_units)
b_h = np.zeros((1, num_hidden_units))
W_hy = np.random.randn(num_hidden_units, num_input_features)
b_y = np.zeros((1, num_input_features))

# Initialize the hidden state (this is the initial state before processing any input)
h_t = np.zeros((batch_size, num_hidden_units))

# Define the learning rate
learning_rate = 0.001

# Loop through the input sequence and update the hidden state at each time step
for i in range(0, len(data) - num_time_steps, batch_size):
    # Extract the input sequence for the current batch
    x_batch = np.zeros((batch_size, num_time_steps, num_input_features))
    for j in range(batch_size):
        x_batch[j,:,:] = data[i+j:i+j+num_time_steps].reshape((num_time_steps, num_input_features))
    
    # Compute the new hidden states for the current batch using the current inputs and previous hidden states
    h_t = np.zeros((batch_size, num_hidden_units))
    for t in range(num_time_steps):
        x_t = x_batch[:,t,:]
        h_t = np.tanh(np.dot(x_t, W_xh) + np.dot(h_t, W_hh) + b_h)
    
    # Compute the outputs for the current batch at the last time step
    y_pred = np.dot(h_t, W_hy) + b_y
    
    # Compute the target values for the current batch (in this case, we want to predict the next day's sales based on the previous 3 days of sales)
    y_true = data[i+num_time_steps:i+num_time_steps+batch_size].reshape((batch_size, num_input_features))
    
    # Compute the error (mean squared error) for the current batch
    error = 0.5 * np.mean((y_pred - y_true) ** 2)
    
    # Compute the gradients of the output layer for the current batch (using the chain rule)
    grad_y = (y_pred - y_true) / batch_size
    grad_W_hy = np.dot(h_t.T, grad_y)
    grad_b_y = np.sum(grad_y, axis=0, keepdims=True)
    
    # Initialize the gradients of the hidden state for the current batch (this will be used as the initial gradients for backpropagation)
    grad_h = np.zeros((batch_size, num_hidden_units))
    
    # Loop backward through the time steps and compute the gradients for each time step for the current batch
    for t in reversed(range(num_time_steps)):
        x_t = x_batch[:,t,:]
        h_t = np.tanh(np.dot(x_t, W_xh) + np.dot(h_t, W_hh) + b_h)
        
        # Compute the gradients for the output of the RNN layer (using the chain rule)
        grad_output = grad_h + np.dot(grad_y, W_hy.T)
        grad_z = grad_output * (1 - h_t ** 2)
        
        # Compute the gradients for the parameters of the RNN layer (using the chain rule)
        grad_W_xh = np.dot(x_t.T, grad_z)
        grad_W_hh = np.dot(h_t.T, grad_z)
        grad_b_h = np.sum(grad_z, axis=0, keepdims=True)
        
        # Update the gradients for the next time step (using the chain rule)
        grad_h = np.dot(grad_z, W_hh.T)
        
        # Accumulate the gradients for the current batch
        if t == num_time_steps - 1:
            total_grad_W_xh = grad_W_xh
            total_grad_W_hh = grad_W_hh
            total_grad_b_h = grad_b_h
            total_grad_W_hy = grad_W_hy
            total_grad_b_y = grad_b_y
        else:
            total_grad_W_xh += grad_W_xh
            total_grad_W_hh += grad_W_hh
            total_grad_b_h += grad_b_h
            total_grad_W_hy += grad_W_hy
            total_grad_b_y += grad_b_y

    # Update the parameters of the RNN layer for the current batch using the computed gradients and the learning rate
    W_xh -= learning_rate * total_grad_W_xh
    W_hh -= learning_rate * total_grad_W_hh
    b_h -= learning_rate * total_grad_b_h
    W_hy -= learning_rate * total_grad_W_hy
    b_y -= learning_rate * total_grad_b_y
    print("Error:", error)

Error: 6845.298336578325
Error: 5806.473069099888
Error: 5643.544407327385
