<a href="https://colab.research.google.com/github/rallm/IUST-DL-Fall2025/blob/main/HW2/helper/hw2_q3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# Array reprs: 4 decimal places, fixed-point notation instead of scientific
np.set_printoptions(suppress=True, precision=4)

In [3]:
# W1 -- input-to-hidden weight matrix, shape (3, 4).
# Row i holds the weights leaving input feature x_(i+1); there is one column
# per hidden neuron.
W1 = np.array(
    [
        [2.5, -1.2, 0.8, 3.1],   # from x1
        [-0.5, 2.8, -1.5, 0.3],  # from x2
        [1.7, -0.9, 2.3, -1.8],  # from x3
    ]
)

In [4]:
# b1 -- hidden-layer bias, one entry per hidden neuron.
# Stored as a row vector of shape (1, 4).
b1 = np.array([0.5, -0.3, 0.2, 0.7], ndmin=2)

In [5]:
# W2 -- hidden-to-output weight matrix, shape (4, 2).
# Row i holds the weights leaving hidden neuron h_(i+1); there is one column
# per output class.
W2 = np.array(
    [
        [1.5, -2.3],   # from h1
        [-0.8, 1.9],   # from h2
        [2.1, -0.6],   # from h3
        [-1.3, 2.7],   # from h4
    ]
)

In [6]:
# b2 -- output-layer bias, one entry per class.
# Stored as a row vector of shape (1, 2).
b2 = np.array([0.3, -0.5]).reshape(1, -1)

In [7]:
# x -- the single input sample with its 3 features.
# Kept 2-D, shape (1, 3), so it can be multiplied directly with W1.
x = np.atleast_2d([1.0, 0.5, 2.0])

In [8]:
# y_true -- one-hot target for the sample; the true label is Class 0 (Sick).
# Row vector of shape (1, 2).
y_true = np.array([1.0, 0.0]).reshape(1, 2)

# Part A: Forward pass and cross-entropy loss

In [10]:
# Hidden-layer pre-activation: z = x W1 + b1
# Shapes: (1, 3) @ (3, 4) -> (1, 4), then the (1, 4) bias is added
z = x @ W1 + b1

In [11]:
# Inspect the hidden-layer pre-activations, shape (1, 4)
z

array([[ 6.15, -1.9 ,  4.85,  0.35]])

In [12]:
def relu(x):
    """Rectified linear unit.

    Returns the element-wise maximum of 0 and ``x``: negative entries become
    0, everything else passes through unchanged.
    """
    floor = 0
    return np.maximum(floor, x)

In [13]:
# Hidden-layer activations: ReLU applied element-wise to z, shape (1, 4)
h = relu(z)

In [14]:
# Inspect the hidden activations (negative pre-activations are zeroed by ReLU)
h

array([[6.15, 0.  , 4.85, 0.35]])

In [15]:
# Output-layer pre-activation (logits): z2 = h W2 + b2
# Shapes: (1, 4) @ (4, 2) -> (1, 2), then the (1, 2) bias is added
z2 = h @ W2 + b2

In [16]:
# Inspect the output logits, shape (1, 2)
z2

array([[ 19.255, -16.61 ]])

In [17]:
def softmax(x, axis=-1):
    """Softmax activation function, normalized along ``axis``.

    The original implementation used a global max and a global sum, so a
    batch of shape (N, C) was normalized over all N*C entries instead of per
    sample. Normalizing along ``axis`` (the class axis) fixes that while
    giving the same result for 1-D inputs and single-row (1, C) inputs.

    Args:
        x: Array-like of logits.
        axis: Axis that holds the class scores. Every slice along this axis
            sums to 1 in the result. Defaults to the last axis.

    Returns:
        ndarray with the same shape as ``x`` containing the probabilities.
    """
    x = np.asarray(x)
    # Subtract the per-slice max for numerical stability; softmax is
    # invariant to adding a constant to all logits of a sample.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    # keepdims=True keeps the reduced axis so the division broadcasts per slice
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

In [18]:
# Final prediction: class probabilities obtained by applying softmax to the logits z2
# Shape (1, 2), same as z2
y_hat = softmax(z2)

In [19]:
# Inspect the predicted probabilities (shown with the 4-decimal print precision set at the top)
y_hat

array([[1., 0.]])

In [20]:
def cross_entropy_loss(y_true, y_pred):
    """Categorical cross-entropy, summed over every entry of the inputs.

    Args:
        y_true: One-hot (or probability) targets.
        y_pred: Predicted probabilities, broadcast-compatible with ``y_true``.

    Returns:
        ``-sum(y_true * log(y_pred))`` with ``y_pred`` clipped to
        ``[1e-9, 1 - 1e-9]`` so the logarithm never sees 0.

    Note:
        Because predictions are also clipped from above, a perfectly
        confident correct prediction yields a loss of about 1e-9, not 0.
    """
    eps = 1e-9
    # Keep probabilities strictly inside (0, 1) before taking the log
    safe_pred = np.clip(y_pred, eps, 1. - eps)

    weighted_log = y_true * np.log(safe_pred)
    return -np.sum(weighted_log)

In [21]:
# Data loss of the forward pass: cross-entropy between the target and the prediction
L_data = cross_entropy_loss(y_true=y_true, y_pred=y_hat)

In [24]:
# Report the data loss rounded to 6 decimals; the ~1e-9 floor introduced by the
# clipping in cross_entropy_loss is below this precision
print(f"{L_data:.6f}")

0.000000


# Part B: L2 / L1 regularization penalties and gradients

In [25]:
# Idea A: L2 Regularization (Weight Decay): L_new = L_data + 0.01 * Σ(w^2)
lambda_l2 = 0.01

# L2 penalty term: squared weights summed per matrix, then combined.
# Only W1 and W2 are penalized; the biases are left out.
penalty_l2_W1 = (W1 ** 2).sum()
penalty_l2_W2 = (W2 ** 2).sum()
total_penalty_l2_term = penalty_l2_W1 + penalty_l2_W2
# Scale the raw term by the regularization strength
total_penalty_l2 = lambda_l2 * total_penalty_l2_term

In [29]:
# Raw L2 term Σ(w^2) over W1 and W2, before scaling by lambda_l2
total_penalty_l2_term

np.float64(66.14)

In [26]:
# Scaled L2 penalty: lambda_l2 * Σ(w^2)
total_penalty_l2

np.float64(0.6614)

In [27]:
# Total loss with L2 regularization: data loss plus the scaled L2 penalty
L_new_l2 = L_data + total_penalty_l2

In [28]:
# Inspect the L2-regularized loss
L_new_l2

np.float64(0.661400001)

In [30]:
# Idea B: L1 Regularization (Lasso): L_new = L_data + 0.005 * Σ|w|
lambda_l1 = 0.005

In [31]:
# L1 penalty term: absolute weights summed per matrix, then combined.
# Only W1 and W2 are penalized; the biases are left out.
penalty_l1_W1 = np.abs(W1).sum()
penalty_l1_W2 = np.abs(W2).sum()
total_penalty_l1_term = penalty_l1_W1 + penalty_l1_W2
# Scale the raw term by the regularization strength
total_penalty_l1 = lambda_l1 * total_penalty_l1_term

In [32]:
# Raw L1 term Σ|w| over W1 and W2, before scaling by lambda_l1
total_penalty_l1_term

np.float64(32.599999999999994)

In [33]:
# Scaled L1 penalty: lambda_l1 * Σ|w|
total_penalty_l1

np.float64(0.16299999999999998)

In [34]:
# Total loss with L1 regularization: data loss plus the scaled L1 penalty
L_new_l1 = L_data + total_penalty_l1

In [35]:
# Inspect the L1-regularized loss
L_new_l1

np.float64(0.16300000099999995)

In [36]:
# Gradient contributions for the single weight W2[0, 0]
# W2[0, 0] is the weight from hidden neuron h1 to output class 0 (value is 1.5)
# NOTE(review): the data-loss gradient below is hard-coded, not computed from the
# forward pass above -- presumably a value given by the assignment; TODO confirm
grad_L_data_wrt_W2_0_0 = -0.3
W2_0_0 = W2[0, 0]

# Gradient of L2 penalty: d/dw (lambda * w^2) = 2 * lambda * w
grad_L2_penalty_wrt_W2_0_0 = 2 * lambda_l2 * W2_0_0
print(f"Gradient of L2 penalty w.r.t W2[0,0]: 2 * {lambda_l2} * {W2_0_0} = {grad_L2_penalty_wrt_W2_0_0:.6f}")

Gradient of L2 penalty w.r.t W2[0,0]: 2 * 0.01 * 1.5 = 0.030000


In [37]:
# Total gradient = gradient from data + gradient from penalty
# The L2 penalty gradient has the same sign as the weight, so under gradient
# descent it pulls the weight toward zero
grad_total_l2 = grad_L_data_wrt_W2_0_0 + grad_L2_penalty_wrt_W2_0_0
print(f"Total Gradient (L2) for W2[0,0]: {grad_L_data_wrt_W2_0_0} + {grad_L2_penalty_wrt_W2_0_0:.6f} = {grad_total_l2:.6f}\n")

Total Gradient (L2) for W2[0,0]: -0.3 + 0.030000 = -0.270000



In [38]:
# Gradient of the L1 penalty term for W2[0,0]
# Gradient of L1 penalty: d/dw (lambda * |w|) = lambda * sign(w)
# Its size does not depend on the weight's magnitude, only on its sign
# sign(1.5) is 1
grad_L1_penalty_wrt_W2_0_0 = lambda_l1 * np.sign(W2_0_0)
print(f"Gradient of L1 penalty w.r.t W2[0,0]: {lambda_l1} * sign({W2_0_0}) = {grad_L1_penalty_wrt_W2_0_0:.6f}")

Gradient of L1 penalty w.r.t W2[0,0]: 0.005 * sign(1.5) = 0.005000


In [39]:
# Total gradient = gradient from data + gradient from penalty
# Uses the same hard-coded data gradient as the L2 case, so the two totals
# differ only by their penalty terms
grad_total_l1 = grad_L_data_wrt_W2_0_0 + grad_L1_penalty_wrt_W2_0_0
print(f"Total Gradient (L1) for W2[0,0]: {grad_L_data_wrt_W2_0_0} + {grad_L1_penalty_wrt_W2_0_0:.6f} = {grad_total_l1:.6f}\n")

Total Gradient (L1) for W2[0,0]: -0.3 + 0.005000 = -0.295000

