In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

In [2]:
import numpy as np
import pandas as pd

# Define the data
data = {
    "Size (sq ft)": [850, 900, 1200, 1400, 1600, 1700, 1800, 2000, 2200, 2500],
    "Bedrooms": [2, 3, 3, 3, 3, 4, 4, 4, 5, 5],
    "Price ($)": [300000, 340000, 400000, 500000, 520000, 580000, 600000, 620000, 720000, 790000]
}

df = pd.DataFrame(data)

X = df[["Size (sq ft)", "Bedrooms"]].values
y = df["Price ($)"].values

X.shape, y.shape

((10, 2), (10,))

In [3]:
from sklearn.preprocessing import StandardScaler

X = X.astype(np.float64)
y = y.astype(np.float64)

# scaler_X = StandardScaler()
# X = scaler_X.fit_transform(X)
# scaler_y = StandardScaler()
# y = scaler_y.fit_transform(y.reshape(-1, 1)).ravel()  # type: ignore
input_dim = X.shape[1]
hidden_dim = (10, 5)
output_dim = 1
n_samples = X.shape[0]

learning_rate = 0.1

input_dim, hidden_dim, output_dim

(2, (10, 5), 1)

In [4]:
coefs_ = []
intercepts_ = []

if not hasattr(hidden_dim, "__iter__"):
    hidden_dim = [hidden_dim]
hidden_dim = list(hidden_dim)

layer_units = [input_dim, *hidden_dim, output_dim]
n_layers_ = len(layer_units)

for i in range(n_layers_ - 1):
    coef_init = np.random.randn(layer_units[i], layer_units[i + 1])
    intercept_init = np.zeros(layer_units[i+1])
    coefs_.append(coef_init)
    intercepts_.append(intercept_init)

layer_units, n_layers_, [c.shape for c in coefs_], [
    i.shape for i in intercepts_]

([2, 10, 5, 1], 4, [(2, 10), (10, 5), (5, 1)], [(10,), (5,), (1,)])

In [5]:
coefs_[-1], intercepts_[-1]

(array([[ 0.36139561],
        [ 1.53803657],
        [-0.03582604],
        [ 1.56464366],
        [-2.6197451 ]]),
 array([0.]))

In [6]:
coef_grads = [
    np.empty((n_fan_in_, n_fan_out_))
    for n_fan_in_, n_fan_out_ in zip(layer_units[:-1], layer_units[1:])
]
intercept_grads = [
    np.empty(n_fan_out_) for n_fan_out_ in layer_units[1:]
]

[c.shape for c in coef_grads], [i.shape for i in intercept_grads]

([(2, 10), (10, 5), (5, 1)], [(10,), (5,), (1,)])

In [7]:
def _forward(X):
    activations = [X] * n_layers_
    # Compute hidden layer activations
    for i in range(n_layers_ - 1):
        activations[i+1] = np.dot(activations[i], coefs_[i])
        activations[i+1] += intercepts_[i]
        if i+1 != n_layers_-1:
            # Tanh activation for hidden layers
            activations[i+1] = np.tanh(activations[i+1])

    return activations


activations = _forward(X)
[a.shape for a in activations], activations[-1]

([(10, 2), (10, 10), (10, 5), (10, 1)],
 array([[-1.72428475],
        [-1.72428475],
        [-1.72428475],
        [-1.72428475],
        [-1.72428475],
        [-1.72428475],
        [-1.72428475],
        [-1.72428475],
        [-1.72428475],
        [-1.72428475]]))

In [8]:
# loss = activations[-1] - y.reshape(-1, 1)

# # Compute gradients for the output layer
# coef_grads[-1] = np.dot(activations[-2].T, loss) / y.shape[0]
# intercept_grads[-1] = np.mean(loss, axis=0)

# # Compute the gradients for the hidden layers
# for i in range(n_layers_ - 2, 0, -1):
#     loss = np.dot(loss, coefs_[i].T) * (1 - np.tanh(activations[i])**2)
#     coef_grads[i-1] = np.dot(activations[i-1].T, loss)
#     intercept_grads[i-1] = np.mean(loss, axis=0)

loss = activations[-1] - y.reshape(-1, 1)

# Compute the gradients for the hidden layers
for i in range(n_layers_ - 1, 0, -1):
    print(i)
    if i != n_layers_ - 1:
        loss = np.dot(loss, coefs_[i].T) * (1 - np.tanh(activations[i])**2)
    coef_grads[i - 1] = np.dot(
        activations[i-1].T, loss) / y.shape[0]
    intercept_grads[i-1] = np.mean(loss, axis=0)

[c.shape for c in coefs_], [i.shape for i in intercepts_]

3
2
1


([(2, 10), (10, 5), (5, 1)], [(10,), (5,), (1,)])

In [11]:
# Gradient clipping to prevent exploding gradients
max_grad_norm = 1.0
for i in range(n_layers_-1):
    coef_grads[i] = np.clip(
        coef_grads[i], -max_grad_norm, max_grad_norm)
    intercept_grads[i] = np.clip(
        intercept_grads[i], -max_grad_norm, max_grad_norm)

In [12]:
# Update weights and biases using gradient descent
for i in range(n_layers_-1):
    coefs_[i] -= learning_rate * coef_grads[i]
    intercepts_[i] -= learning_rate * intercept_grads[i]