# In [2]:
# Adaptive Moment Estimation (the Adam optimizer)

import numpy as np

class Adam:
    """Adam (Adaptive Moment Estimation) optimizer over a dict of parameters.

    The optimizer keeps an exponentially decayed average of the gradients
    (first moment, ``m``) and of the squared gradients (second moment, ``v``)
    for every parameter, and uses their bias-corrected values to scale each
    update.

    Attributes:
        params: The parameter dictionary passed to the constructor. It is
            stored by reference and its entries are updated by ``step``.
        lr, beta1, beta2, epsilon: The hyperparameters given at construction.
        t: Number of successful ``step`` calls so far.
        m, v: Per-parameter first and second moment estimates, keyed like
            ``params``.
    """

    def __init__(self, params, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Create the optimizer and zero-initialize the moment estimates.

        Args:
            params (dict): Parameters to optimize, for example
                ``{'W': weight_matrix, 'b': bias_vector}``.
            lr (float): Learning rate; must be non-negative.
            beta1 (float): Exponential decay rate for the first moment
                estimates; must satisfy ``0 <= beta1 < 1``.
            beta2 (float): Exponential decay rate for the second moment
                estimates; must satisfy ``0 <= beta2 < 1``.
            epsilon (float): Small non-negative constant added to the
                denominator to prevent division by zero.

        Raises:
            ValueError: If any hyperparameter is outside its valid range.
        """
        if lr < 0:
            raise ValueError(f"Invalid learning rate: {lr}")
        # A decay rate of exactly 1 would make the bias-correction
        # denominator (1 - beta ** t) zero and turn every update into NaN/inf.
        if not 0.0 <= beta1 < 1.0:
            raise ValueError(f"Invalid beta1 (expected 0 <= beta1 < 1): {beta1}")
        if not 0.0 <= beta2 < 1.0:
            raise ValueError(f"Invalid beta2 (expected 0 <= beta2 < 1): {beta2}")
        if epsilon < 0:
            raise ValueError(f"Invalid epsilon: {epsilon}")

        self.params = params
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.t = 0  # Time step

        # Initialize first (m) and second (v) moment variables for each parameter as zeros.
        self.m = {key: np.zeros_like(value) for key, value in params.items()}
        self.v = {key: np.zeros_like(value) for key, value in params.items()}

    def step(self, grads):
        """Perform a single update step using the Adam update rule.

        Args:
            grads (dict): Gradients keyed like ``params``. Each value may be
                a NumPy array or anything ``np.asarray`` accepts.

        Raises:
            KeyError: If a gradient is missing for some parameter. The check
                happens before any state is modified, so a failed call leaves
                the time step, the moments and the parameters untouched.
        """
        # Gather (and convert) every gradient up front so that a missing key
        # cannot leave the optimizer half-updated.
        try:
            grad_arrays = {key: np.asarray(grads[key]) for key in self.params}
        except KeyError as err:
            raise KeyError(f"Missing gradient for parameter {err.args[0]!r}") from err

        self.t += 1
        # The bias-correction factors depend only on the time step, so they
        # are computed once per call rather than once per parameter.
        bias_correction1 = 1 - self.beta1 ** self.t
        bias_correction2 = 1 - self.beta2 ** self.t

        for key, grad in grad_arrays.items():
            # Update biased first moment estimate.
            self.m[key] = self.beta1 * self.m[key] + (1 - self.beta1) * grad
            # Update biased second moment estimate.
            self.v[key] = self.beta2 * self.v[key] + (1 - self.beta2) * (grad ** 2)
            # Compute bias-corrected estimates.
            m_hat = self.m[key] / bias_correction1
            v_hat = self.v[key] / bias_correction2
            # Update parameters. The augmented assignment modifies NumPy
            # arrays in place, so callers holding the array see the new value.
            self.params[key] -= self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)

# --- Demo: driving f(x) = x^2 to its minimum with Adam ---
# f has gradient 2*x, so the optimizer should push x toward 0.
params = dict(x=np.array([5.0]))   # initial guess: x = 5.0
optimizer = Adam(params, lr=0.1)   # learning rate of 0.1

# Toy training loop: 100 Adam updates, reporting progress every 10th one.
for i in range(100):
    # Analytic gradient of x^2 at the current point.
    grads = {'x': params['x'] * 2}

    # One Adam update; the entry in `params` is modified by the optimizer.
    optimizer.step(grads)

    # Loss at the freshly updated parameter value.
    loss = np.square(params['x'])

    # Only report on iterations 0, 10, 20, ...
    if i % 10:
        continue
    print(f"Iteration {i}: x = {params['x'][0]:.4f}, Loss = {loss[0]:.4f}")




# Output:
# Iteration 0: x = 4.9000, Loss = 24.0100
# Iteration 10: x = 3.9101, Loss = 15.2888
# Iteration 20: x = 2.9699, Loss = 8.8201
# Iteration 30: x = 2.1244, Loss = 4.5130
# Iteration 40: x = 1.4112, Loss = 1.9913
# Iteration 50: x = 0.8524, Loss = 0.7267
# Iteration 60: x = 0.4512, Loss = 0.2036
# Iteration 70: x = 0.1917, Loss = 0.0368
# Iteration 80: x = 0.0455, Loss = 0.0021
# Iteration 90: x = -0.0210, Loss = 0.0004
