# Linear Regression from Scratch in Python 

In [22]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Defining Functions

### Problem 2. In the *loss* function, the prediction error (residual) is evaluated for each observation. 
1. What value is `y_pred` assigned right before `res[i] = y[i] - y_pred` is executed?
2. In the *loss_vectorized* function defined below, what would `np.dot(X, w)+b` return?

**Your Answers:** *fill in here*

In [23]:
# loss function computes and returns the mean squared error (SSE, the sum of the squared residuals, divided by the number of observations) and the residuals
def loss(y, X, w, b):
    """ Unvectorized version.

    Evaluates the linear model y_pred = b + sum_j w[j] * X[i][j] one
    observation at a time.

    Parameters:
        y: observed values, indexed by observation.
        X: 2-D feature matrix; rows are observations, columns are features.
        w: coefficients, one per column of X.
        b: bias (intercept).

    Returns:
        (SSE / number of observations, res), where res[i] = y[i] - y_pred
        is the residual of observation i.
    """
    n_obs, n_feat = X.shape[0], X.shape[1]
    sse = 0
    res = np.zeros(n_obs)
    for i in range(n_obs):
        # evaluate the predicted value of y for each observation
        y_pred = b
        for j in range(n_feat):
            # Rebind instead of using +=: if b is a NumPy array (e.g. 0-d),
            # an in-place add would write through the alias, changing the
            # caller's b and contaminating every later prediction.
            y_pred = y_pred + w[j] * X[i][j]
        res[i] = y[i] - y_pred
        sse += res[i] ** 2.0
    return sse/n_obs, res

In [24]:
# loss_vectorized function computes and returns the mean squared error (SSE, the sum of the squared residuals, divided by the number of observations) and the residuals
def loss_vectorized(y, X, w, b):
    """ Vectorized version.

    Parameters:
        y: observed values, one per row of X.
        X: 2-D feature matrix; rows are observations, columns are features.
        w: coefficients, one per column of X.
        b: bias (intercept), added to every prediction.

    Returns:
        (SSE / number of observations, residuals), where
        residuals = y - (np.dot(X, w) + b).
    """
    # predictions for all observations at once
    fitted = np.dot(X, w) + b
    residuals = y - fitted
    sse = np.sum(np.square(residuals))
    return sse/X.shape[0], residuals

### Problem 3. In the *grd* function, the gradient of the loss function with respect to `w` and `b` is returned.
1. What is the shape of `dw`? 
2. Is `db` an array or a scalar?
3. In the grd function defined below, what would `-2 * np.dot(X.T, res) / X.shape[0]` return?
4. In the grd function defined below, what would `-2 * np.sum(res) / X.shape[0]` return?

**Your Answers:** *fill in here*

In [25]:
# grd function computes and returns the gradient (dw, db)
def grd(X, w, b, res):
    """ Unvectorized version.

    Parameters:
        X: 2-D feature matrix; rows are observations, columns are features.
        w: coefficients; only its shape is used, to size dw.
        b: bias; not used in the computation.
        res: residuals, one per row of X (the second value returned by loss).

    Returns:
        (dw, db) where dw[j] = -2 * sum_i(res[i] * X[i][j]) / X.shape[0]
        and db = -2 * sum_i(res[i]) / X.shape[0].
    """
    n_obs = X.shape[0]
    # dw contains the partial derivatives of the loss to w.
    # Force a float accumulator: zeros_like(w) alone inherits w's dtype, so an
    # integer w would silently truncate every partial sum stored into dw[j].
    dw = np.zeros_like(w, dtype=float)
    # db contains the derivative of the loss to b
    db = 0
    for j in range(X.shape[1]):
        for i in range(n_obs):
            dw[j] -= 2 * res[i] * X[i][j]
    for i in range(n_obs):
        db -= 2 * res[i]
    return dw/n_obs, db/n_obs

In [26]:
# grd_vectorized function computes and returns the gradient (dw, db)
def grd_vectorized(X, w, b, res):
    """ Vectorized version.

    Parameters:
        X: 2-D feature matrix; rows are observations, columns are features.
        w: coefficients; not used in the computation (kept for a signature
           parallel to grd).
        b: bias; not used in the computation.
        res: residuals, one per row of X (the second value returned by
             loss_vectorized).

    Returns:
        (dw, db): dw holds one partial derivative per column of X, db is the
        derivative with respect to the bias.
    """
    n_obs = X.shape[0]
    # Compute the gradient directly instead of subtracting in place from
    # zeros_like(w): that buffer inherits w's dtype, and an integer w makes
    # the in-place subtraction of a float result raise a casting error.
    # dw contains the partial derivatives of the loss to w
    dw = -2 * np.dot(X.T, res) / n_obs
    # db contains the derivative of the loss to b
    db = -2 * np.sum(res) / n_obs
    return dw, db

### Problem 4. In the *grd_descent* function, gradient descent is applied to update parameters `w` and `b`.
1. What does argument `iter` represent? 
2. Is `db` an array or a scalar?
3. In the grd_descent_vectorized function defined below, what does the statement `w -= lr * dw` do?

**Your Answers:** *fill in here*

In [27]:
# grd_descent function applies gradient descent for iter iterations
def grd_descent(y, X, w, b, lr = 1.0e-5, iter = 100):
    """ Unvectorized version.

    Parameters:
        y: observed values, one per row of X.
        X: 2-D feature matrix; rows are observations, columns are features.
        w: initial coefficients, one per column of X (not modified).
        b: initial bias.
        lr: learning rate (step size applied to each gradient).
        iter: number of gradient-descent updates to perform.

    Returns:
        (w, b, cost): the updated coefficients and bias, and the loss reported
        in the last iteration. That loss is evaluated before the last update
        is applied. With iter = 0 the parameters are returned unchanged
        together with their loss.
    """
    # Work on a private float copy: the caller's array is left untouched and
    # an integer-typed w cannot truncate the fractional updates below.
    w = np.array(w, dtype=float)
    cost = None
    for t in range(iter):
        cost, res = loss(y, X, w, b)
        print("Iteration %d: Loss = %.4f" %(t+1, cost))
        dw, db = grd(X, w, b, res)
        for j in range(X.shape[1]):
            w[j] -= lr * dw[j]
        # rebind (not -=) so an array-typed b from the caller is never mutated
        b = b - lr * db
    if cost is None:
        # no update was run; still hand back a defined loss value
        cost, res = loss(y, X, w, b)
    return w, b, cost

In [28]:
# grd_descent_vectorized function applies gradient descent for iter iterations
def grd_descent_vectorized(y, X, w, b, lr = 1.0e-5, iter = 100):
    """ Vectorized version.

    Parameters:
        y: observed values, one per row of X.
        X: 2-D feature matrix; rows are observations, columns are features.
        w: initial coefficients, one per column of X (not modified).
        b: initial bias.
        lr: learning rate (step size applied to each gradient).
        iter: number of gradient-descent updates to perform.

    Returns:
        (w, b, cost): the updated coefficients and bias, and the loss reported
        in the last iteration. That loss is evaluated before the last update
        is applied. With iter = 0 the parameters are returned unchanged
        together with their loss.
    """
    # Work on a private float copy: the in-place update below then never
    # touches the caller's array, and an integer-typed w no longer makes
    # `w -= lr * dw` fail with a casting error.
    w = np.array(w, dtype=float)
    cost = None
    for t in range(iter):
        cost, res = loss_vectorized(y, X, w, b)
        print("Iteration %d: Loss = %.4f" %(t+1, cost))
        dw, db = grd_vectorized(X, w, b, res)
        w -= lr * dw
        # rebind (not -=) so an array-typed b from the caller is never mutated
        b = b - lr * db
    if cost is None:
        # no update was run; still hand back a defined loss value
        cost, res = loss_vectorized(y, X, w, b)
    return w, b, cost

## Training the Linear Regression Model

In [29]:
# dataset: 5 observations, each with 2 features
# y is a vector containing the values of the dependent variable
y = np.array([63, 72, 78, 79, 62])
# one vector per feature (independent variable)
x1 = np.array([48, 62, 79, 76, 59])
x2 = np.array([68, 81, 80, 83, 64])
# stack the feature vectors as rows, then transpose: X is a 5 by 2 matrix
# with one row per observation and one column per feature
X = np.array((x1, x2)).transpose()
print("Dataset Size:")
print("Number of observations: %3d" % len(X))
print("Number of features: %2d" % X.shape[1])

Dataset Size:
Number of observations:   5
Number of features:  2


### Problem 1. In the next code cell, NumPy arrays `X` and `y` constructed in the previous code cell are used to train the linear regression model. Click the *Cell* tab, and select *Run All* to train the model. You should see that the loss (MSE, the sum of the squared residuals divided by the number of observations) is decreasing, which means that the parameters `w` and `b` are updated iteratively to reduce the prediction errors.
1. What does `X.shape[0]` return?
2. What does `X.shape[1]` return?
3. Examine the last statement in the next code cell. Interpret root mean square error (RMSE).

**Your Answers:** *fill in here*

In [30]:
# train the model
# initialization; w is a vector containing the coefficients; b is a scalar equal to the bias
w = np.ones(X.shape[1]) * 1.0e-3
b = 0
# learning rate
lr = 1.0e-5
# call the grd_descent function to learn the parameters
# (the returned loss is stored as `mse`, not `loss`, so that the loss()
# function used inside grd_descent is not overwritten by a number)
w, b, mse = grd_descent(y, X, w, b, lr, iter = 5)
print('Returned parameters:')
print('Coefficients: ', w)
print('Bias: ', b)
print('Regression Equation:')
print('y = %.4f ' %b, end = '')
for i in range(X.shape[1]):
    print('+ %.4fX%-2d' %(w[i], i+1), end = '')
# RMSE is the square root of the loss reported in the last iteration
print('\nRoot Mean Square Error(RMSE): %.4f' %(np.sqrt(mse)))

Iteration 1: Loss = 5044.3427
Iteration 2: Loss = 3228.4750
Iteration 3: Loss = 2066.8351
Iteration 4: Loss = 1323.7147
Iteration 5: Loss = 848.3278
Returned parameters:
Coefficients:  [0.31349735 0.36181406]
Bias:  0.004755988622235966
Regression Equation:
y = 0.0048 + 0.3135X1 + 0.3618X2 
Root Mean Square Error(RMSE): 29.1261


In [32]:
# train the model
# initialization; w is a vector containing the coefficients; b is a scalar equal to the bias
w = np.ones(X.shape[1]) * 1.0e-3
b = 0
# learning rate
lr = 1.0e-5
# call the grd_descent_vectorized function to learn the parameters
# (the returned loss is stored as `mse`, not `loss`, so that the loss()
# function used by grd_descent is not overwritten by a number)
w, b, mse = grd_descent_vectorized(y, X, w, b, lr, iter = 5)
# raw parameter values at full precision
print(w, b)
print('Returned parameters:')
print('Coefficients: ', w)
print('Bias: ', b)
print('Regression Equation:')
print('y = %.4f ' %b, end = '')
for i in range(X.shape[1]):
    print('+ %.4fX%-2d' %(w[i], i+1), end = '')
# RMSE is the square root of the loss reported in the last iteration
print('\nRoot Mean Squared Error: %f' %(np.sqrt(mse)))

Iteration 1: Loss = 5044.3427
Iteration 2: Loss = 3228.4750
Iteration 3: Loss = 2066.8351
Iteration 4: Loss = 1323.7147
Iteration 5: Loss = 848.3278
[0.31349735 0.36181406] 0.004755988622235966
Returned parameters:
Coefficients:  [0.31349735 0.36181406]
Bias:  0.004755988622235966
Regression Equation:
y = 0.0048 + 0.3135X1 + 0.3618X2 
Root Mean Squared Error: 29.126068
