In [10]:
import numpy as np

In [11]:
# Toy regression data: three samples with two features each.
X = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)

# Targets follow y = 0.5 * x0 + 2 * x1, computed on whole columns at once
# rather than with a per-row Python loop.
y = X[:, 0] * 0.5 + 2 * X[:, 1]
# Hand-picked predictions; each one is 0.5 above its target.
y_pred = np.array([5., 10., 15.])

# Turn both into column vectors of shape (n_samples, 1).
# ndarray.reshape is used instead of np.reshape(..., shape=...) because the
# `shape` keyword is only accepted by NumPy >= 2.1 and raises TypeError on
# older releases.
y = y.reshape(-1, 1)
y_pred = y_pred.reshape(-1, 1)

print("X:\n", X)
print("\ny:\n", y)
print("\ny_pred:\n", y_pred)

X:
 [[1. 2.]
 [3. 4.]
 [5. 6.]]

y:
 [[ 4.5]
 [ 9.5]
 [14.5]]

y_pred:
 [[ 5.]
 [10.]
 [15.]]


In [12]:
# Per-sample residuals: prediction minus target, element by element.
errors = np.subtract(y_pred, y)
errors

array([[0.5],
       [0.5],
       [0.5]])

In [13]:
def relu_derivative(X):
    """
    Element-wise derivative of ReLU.

    Parameters:
    - X (array-like): Values at which the derivative is evaluated.

    Returns:
    - float32 result with the same shape as X: 1.0 where X > 0, 0.0 elsewhere
      (the derivative at exactly 0 is taken to be 0).
    """
    is_positive = np.greater(np.asarray(X), 0)
    return is_positive.astype(np.float32)


def get_gradient(y_pred, y, X, z):
    """
    Compute the gradients of the MSE loss with respect to the weights and bias
    of a linear layer followed by a ReLU activation.

    Parameters:
    - y_pred (numpy.ndarray): Predicted values, shape [n_samples, 1].
    - y (numpy.ndarray): True values, shape [n_samples, 1].
    - X (numpy.ndarray): Input features, shape [n_samples, n_features].
    - z (numpy.ndarray): Linear combination of inputs and weights (X * W.T + b), shape [n_samples, 1].

    Returns:
    - w_gradient (numpy.ndarray): Gradient wrt the weights, shape [1, n_features]
      for the shapes documented above.
    - b_gradient (numpy scalar): Gradient wrt the bias, i.e. the sum of all
      upstream gradient entries.
    """
    # Gradient of MSE wrt prediction, (2 / n_samples) * (y_pred - y): [n_samples, 1]
    residuals = y_pred - y
    dloss_dpred = 2 / len(X) * residuals

    # ReLU passes gradient only where z > 0; everywhere else the mask zeroes
    # it out, stopping the gradient for that sample: [n_samples, 1]
    upstream_gradient = dloss_dpred * relu_derivative(z)

    # Chain rule through z = X @ W.T + b.
    w_gradient = np.matmul(upstream_gradient.T, X)  # [n_units, n_inputs]
    b_gradient = upstream_gradient.sum()
    return w_gradient, b_gradient

In [14]:
# y_pred is passed as z as well: every prediction here is positive, so the
# ReLU mask is all ones and the gradient flows through unchanged.
get_gradient(y_pred=y_pred, y=y, X=X, z=y_pred)

(array([[3., 4.]]), np.float64(1.0))

In [15]:
# Manual check: gradient of the MSE loss wrt each prediction, shown as a row vector.
dL_dy_hat = errors * (2 / len(X))
np.transpose(dL_dy_hat)

array([[0.33333333, 0.33333333, 0.33333333]])

In [16]:
# Display the feature matrix that the loss-gradient row vector is multiplied with.
X

array([[1., 2.],
       [3., 4.],
       [5., 6.]], dtype=float32)

In [17]:
# Manual weight gradient: [1, n_samples] @ [n_samples, n_features] -> [1, n_features].
# This should match the w_gradient returned by get_gradient.
np.matmul(dL_dy_hat.T, X)

array([[3., 4.]])