# Gradient Descent for Linear Regression

Automating the process of optimizing $m$ and $b$ using gradient descent.

$$ \min_{m, b} J(m, b) $$

In [None]:
import math, copy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Apply the plotting style sheet. The path is relative, so the .mplstyle file
# must be present in the directory the notebook is run from.
plt.style.use('./deeplearning.mplstyle')

In [None]:
# Toy training set: five evenly spaced feature values and their targets.
# Each target is its feature minus one, so the exact fit is m = 1, b = -1.
x_train = np.arange(2, 11, 2)  # features: 2, 4, 6, 8, 10
y_train = x_train - 1          # targets:  1, 3, 5, 7, 9


#### Cost Function
The cost function is a measure of how well the model fits the data. For linear regression, we use the Mean Squared Error (MSE), halved so that the factor of 2 cancels when the cost is differentiated:
$$ J(m, b) = \frac{1}{2n} \sum_{i=1}^{n} (mx_i + b - y_i)^2 $$


In [None]:
def compute_cost(x, y, m, b):
    """
    Computes the halved mean squared error of the line m*x + b
    Args:
      x (ndarray (n,)): Data, n examples
      y (ndarray (n,)): target values
      m,b (scalar)    : model parameters (slope and intercept)
    Returns
      total_cost (scalar): sum of squared prediction errors divided by 2n
    """
    num_examples = x.shape[0]
    squared_error_total = 0
    for idx in range(num_examples):
        # Signed prediction error for example idx
        residual = m * x[idx] + b - y[idx]
        squared_error_total = squared_error_total + residual ** 2
    return (1 / (2 * num_examples)) * squared_error_total

#### Compute Gradient Function
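The gradient is the vector of partial derivatives of the cost function with respect to each parameter. For linear regression, we have:
$$ \nabla J(m, b) = \begin{bmatrix} \frac{\partial J}{\partial m} \\ \frac{\partial J}{\partial b} \end{bmatrix} $$

For the cost computed by `compute_cost` (which carries a factor of $\frac{1}{2n}$), the two partial derivatives are:
$$ \frac{\partial J}{\partial m} = \frac{1}{n} \sum_{i=1}^{n} (mx_i + b - y_i)\,x_i \qquad \frac{\partial J}{\partial b} = \frac{1}{n} \sum_{i=1}^{n} (mx_i + b - y_i) $$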

In [None]:
def compute_gradient(x, y, m, b):
    """
    Computes the gradient of the cost for linear regression
    Args:
      x (ndarray (n,)): Data, n examples
      y (ndarray (n,)): target values
      m,b (scalar)    : model parameters (slope and intercept)
    Returns
      dj_dm (scalar): The gradient of the cost w.r.t. the parameter m
      dj_db (scalar): The gradient of the cost w.r.t. the parameter b
    """
    num_examples = x.shape[0]
    grad_m = 0
    grad_b = 0

    for idx in range(num_examples):
        # Signed prediction error for example idx; it feeds both partials
        error = m * x[idx] + b - y[idx]
        grad_b += error
        grad_m += error * x[idx]

    # Average the per-example contributions
    return grad_m / num_examples, grad_b / num_examples

####  Gradient Descent Function
The gradient descent algorithm updates the parameters in the direction of the negative gradient:
$$ \begin{bmatrix} m \\ b \end{bmatrix} = \begin{bmatrix} m \\ b \end{bmatrix} - \alpha \nabla J(m, b) $$

Where $\alpha$ is the learning rate.

##### Learning Rate
The learning rate is a hyperparameter that controls how much to change the model parameters in response to the estimated gradient. A small learning rate means the model will learn slowly, while a large learning rate can cause the updates to overshoot the minimum, so the model may settle on a suboptimal solution or even diverge.

##### Stopping Criteria
The stopping criteria for gradient descent can be based on:
- A maximum number of iterations
- A threshold for the change in the cost function
- A threshold for the change in the parameters
- A threshold for the gradient

In [None]:
def _history_row(iteration, cost, dj_dm, dj_db, m, b):
    """Build a one-row DataFrame for the progress table.

    A derivative passed as None is shown as an empty string, which is how
    rows produced before any gradient has been evaluated are displayed.
    """
    return pd.DataFrame(
        {
            "Iteration": [iteration],
            "Cost": [f"{cost:.8f}"],
            "m derivative": ["" if dj_dm is None else f"{dj_dm:.8f}"],
            "b derivative": ["" if dj_db is None else f"{dj_db:.8f}"],
            "m": [m],
            "b": [b],
        },
    )


def gradient_descent(x, y, m_in, b_in, alpha, num_iters, cost_function,
                     gradient_function, *, max_history=100000):
    """
    Performs gradient descent to fit m,b. Updates m,b by taking
    num_iters gradient steps with learning rate alpha

    Args:
      x (ndarray (n,))  : Data, n examples
      y (ndarray (n,))  : target values
      m_in,b_in (scalar): initial values of model parameters
      alpha (float):     Learning rate
      num_iters (int):   number of iterations to run gradient descent
      cost_function:     function to call to produce cost,
                         called as cost_function(x, y, m, b)
      gradient_function: function to call to produce gradient,
                         called as gradient_function(x, y, m, b) and
                         returning (dj_dm, dj_db)
      max_history (int): keyword-only; maximum number of iterations whose
                         cost and parameters are stored in the history lists

    Returns:
      m (scalar): Updated value of parameter after running gradient descent
      b (scalar): Updated value of parameter after running gradient descent
      J_history (list): History of cost values (at most max_history entries)
      p_history (list): History of parameters [m,b] (same length as J_history)
      data (DataFrame): Progress table with an "Initial" row, a row roughly
                        every num_iters/10 iterations, and a "Final" row
    """
    # Histories of cost and parameters, primarily for graphing later
    J_history = []
    p_history = []
    m = m_in
    b = b_in

    rows = [_history_row("Initial", cost_function(x, y, m, b), None, None, m, b)]

    # Report about 10 times over the run; never use an interval of 0
    log_every = max(1, math.ceil(num_iters / 10))

    # Stay None when num_iters is 0 so the "Final" row can still be built
    dj_dm = dj_db = None

    for i in range(num_iters):
        # Gradient at the current parameters
        dj_dm, dj_db = gradient_function(x, y, m, b)

        # Step against the gradient, scaled by the learning rate
        b = b - alpha * dj_db
        m = m - alpha * dj_dm

        keep = i < max_history          # cap list growth on very long runs
        report = i % log_every == 0

        # Evaluate the cost only when it is stored or shown, and always at the
        # current parameters so reported values are never stale past the cap
        if keep or report:
            cost = cost_function(x, y, m, b)
        if keep:
            J_history.append(cost)
            p_history.append([m, b])
        if report:
            rows.append(_history_row(i, cost, dj_dm, dj_db, m, b))

    # The final cost is evaluated directly rather than read from J_history,
    # which may be empty (num_iters == 0) or truncated (num_iters > max_history)
    rows.append(_history_row("Final", cost_function(x, y, m, b), dj_dm, dj_db, m, b))

    # Concatenate once instead of re-copying the table for every added row
    data = pd.concat(rows)
    return m, b, J_history, p_history, data

#### Set Gradient Descent Parameters

In [None]:
# Initial guess for the slope (m) and intercept (b)
m_init, b_init = 0, 0
# Stopping criterion: a fixed number of gradient steps
iterations = 10_000
# Learning rate (step size) used for every update
tmp_alpha = 1e-2

#### Run Gradient Descent And Analyze The Results

In [None]:
# Fit the line to the training data, starting from the initial parameters
m_final, b_final, J_hist, p_hist, data = gradient_descent(
    x=x_train,
    y=y_train,
    m_in=m_init,
    b_in=b_init,
    alpha=tmp_alpha,
    num_iters=iterations,
    cost_function=compute_cost,
    gradient_function=compute_gradient,
)

# Show the progress table, then the fitted parameters
print(data)
print(f"(m,b) found by gradient descent: ({m_final:8.4f},{b_final:8.4f})")

#### Plot The Optimal Parameters Discovered By Gradient Descent

In [None]:
# Predictions of the fitted line at every training feature value.
# Computed inline because no compute_model_output helper is defined or
# imported in this notebook; NumPy applies the scalar m and b element-wise.
tmp_f_mb = m_final * x_train + b_final

# Plot the fitted line over the training points
plt.plot(x_train, tmp_f_mb, c='b',label='Our Prediction')
plt.scatter(x_train, y_train, marker='x', c='r',label='Actual Values')
plt.title('Apply the m,b found by gradient descent')
plt.ylabel('Target')
plt.xlabel('Feature')
plt.legend()
plt.show()