# Linear Regression with Numpy
We compute the regression line that predicts the count of the letter *a* in a text from its total count of letters.

Author: Pierre Nugues

## Modules
We import the modules we need to run the program

In [None]:
import random
import numpy as np
import datasets
import matplotlib.pyplot as plt

## Function to compute the sum of squared errors

In [None]:
def sse(X, y, w):
    """
    Sum of squared errors of the linear model X @ w with respect to y
    :param X: matrix of predictors, one row per observation
    :param y: observed responses
    :param w: weights of the model
    :return: the residuals multiplied by themselves, residuals.T @ residuals
    """
    residuals = y - X @ w
    return residuals.T @ residuals

## Function to normalize the data

In [None]:
def normalize(Xy):
    """
    Scale every column of Xy by the maximum of that column
    :param Xy: array with one column per variable
    :return: a tuple (scaled array, vector of the column maxima)
    :raises numpy.linalg.LinAlgError: if the maximum of a column is zero
    """
    maxima = np.amax(Xy, axis=0)
    # A zero maximum cannot be used as a scale. Inverting the diagonal
    # matrix of the maxima used to fail with this same exception type.
    if np.any(maxima == 0):
        raise np.linalg.LinAlgError(
            "Cannot normalize: the maximum of a column is zero")
    # Broadcasting divides each column by its own maximum; no need to
    # build and invert a diagonal matrix
    return (Xy / maxima, maxima)

## Stochastic descent
Function to apply stochastic gradient descent.

In [None]:
def stoch_descent(X, y, alpha, w, max_epochs=500, epsilon=0.005):
    """
    Stochastic gradient descent
    The weights are updated after each observation. The observations are
    visited in a new random order at every epoch; the random module is
    seeded with 0 so that the order is the same from one run to the next.
    The history is stored in two global lists as flat sequences
    w, alpha, sse, w, alpha, sse, ...: logs has one triple per epoch and
    logs_stoch one triple per individual update.
    :param X: matrix of predictors, one row per observation
    :param y: responses, indexed like the rows of X
    :param alpha: learning rate
    :param w: initial weights, a column vector with one row per predictor
    :param max_epochs: maximal number of passes over the data
    :param epsilon: the descent stops when the relative change of w over
        an epoch falls below this value
    :return: the weights after the last update
    """
    global logs, logs_stoch
    logs = []
    logs_stoch = []
    random.seed(0)
    idx = list(range(len(X)))
    for epoch in range(max_epochs):
        random.shuffle(idx)
        # w is rebound, never modified in place, so w_old keeps the
        # weights from the start of the epoch
        w_old = w
        for i in idx:
            loss = y[i] - X[i] @ w
            gradient = loss * X[i].reshape(-1, 1)
            w = w + alpha * gradient
            logs_stoch.extend((w, alpha, sse(X, y, w)))
        # Log the epoch before testing the convergence so that the final
        # weights are part of the history, as in batch_descent
        logs.extend((w, alpha, sse(X, y, w)))
        step = np.linalg.norm(w - w_old)
        scale = np.linalg.norm(w)
        # The test on scale avoids a division by zero when w is null
        if step == 0 or (scale > 0 and step / scale < epsilon):
            print("Epoch", epoch)
            break
    return w

## Batch descent
Function to apply batch gradient descent.

In [None]:
def batch_descent(X, y, alpha, w, max_epochs=499, epsilon=0.0005):
    """
    Batch gradient descent
    The weights are updated once per epoch from the gradient computed on
    all the observations. The history is stored in the global list logs
    as a flat sequence w, alpha, sse, w, alpha, sse, ... with one triple
    per epoch; the logged alpha is the learning rate divided by the
    number of observations.
    :param X: matrix of predictors, one row per observation
    :param y: responses, one row per observation
    :param alpha: learning rate, divided by len(X) before use
    :param w: initial weights
    :param max_epochs: maximal number of updates
    :param epsilon: the descent stops when the relative change of w over
        an epoch falls below this value
    :return: the weights after the last update
    """
    global logs
    logs = []
    alpha /= len(X)
    for epoch in range(1, max_epochs + 1):
        loss = y - X @ w
        gradient = X.T @ loss
        w_old = w
        w = w + alpha * gradient
        logs.extend((w, alpha, sse(X, y, w)))
        step = np.linalg.norm(w - w_old)
        scale = np.linalg.norm(w)
        # A null step means the descent cannot move anymore. The test on
        # scale avoids the division 0 / 0, whose nan result would never
        # pass the comparison and would let the loop run to its end.
        if step == 0 or (scale > 0 and step / scale < epsilon):
            print("Epoch", epoch)
            break
    return w

## We load the data and optionally normalize it

In [None]:
# Switches: scale the data before the descent, print the logs after it
normalized = True
debug = False
X, y = datasets.load_tsv(
    'https://raw.githubusercontent.com/pnugues/ilppp/master/programs/ch04/salammbo/salammbo_a_en.tsv')
# Predictors as an array
X = np.array(X)
# Response as a column
y = np.array([y]).T

if normalized:
    # Each column is divided by its maximum; the maxima are kept to
    # restore the weights and the coordinates later
    X, maxima_X = normalize(X)
    y, maxima_y = normalize(y)
    maxima = np.concatenate((maxima_X, maxima_y))
    alpha = 1.0
    print("-Normalized-")
else:
    # Learning rate used on the raw data
    alpha = 1.0e-10

## We apply a batch descent

In [None]:
print("===Batch descent===")
# The descent starts from a column of null weights, one per predictor
w = batch_descent(X, y, alpha, np.zeros((X.shape[1], 1)))
print("Weights", w)
print("SSE", sse(X, y, w))
if normalized:
    # maxima holds the scales of the predictors followed by the scale
    # of the response; the weights are brought back to the original units
    maxima = maxima.reshape(-1, 1)
    w = (w / maxima[:-1, 0:1]) * maxima[-1, 0]
    print("Restored weights", w)
if debug:
    print("Logs", logs)

## We restore the coordinates

In [None]:
# Coordinates of the points in the original units: second column of X
# against y
if normalized:
    # Undo the scaling applied by normalize()
    x_fig = [x_row[1] * maxima_X[1] for x_row in X]
    y_fig = [yi * maxima_y for yi in y]
else:
    # maxima_X and maxima_y only exist when the data was normalized;
    # the raw data is already in the original units
    x_fig = [x_row[1] for x_row in X]
    y_fig = list(y)

## We plot the coordinates and the line

In [None]:
# The line is drawn between the smallest and the largest abscissa;
# the leading 1 multiplies the first weight
x_ends = [min(x_fig), max(x_fig)]
plt.scatter(x_fig, y_fig)
plt.plot(x_ends, [[1, x_end] @ w for x_end in x_ends])
plt.show()

## Showing the iterations

### The errors

In [None]:
# Every third item of logs, starting at index 2, is a sum of squared errors
epoch_errors = logs[2::3]
plt.scatter(range(len(epoch_errors)), epoch_errors, c='b', marker='x')
plt.title("Batch gradient descent: Sum of squared errors")
plt.show()

In [None]:
# Every third item of logs, starting at index 0, is a weight vector.
# The vectors are flattened so that matplotlib receives plain numbers
# instead of one-element arrays, whose implicit conversion to a scalar
# is deprecated in recent NumPy versions.
batch_weights = [np.ravel(weights) for weights in logs[0::3]]
plt.plot([weights[0] for weights in batch_weights],
         [weights[1] for weights in batch_weights], marker='o')
# Each point is labelled with the index of its update
for i, weights in enumerate(batch_weights):
    plt.annotate(i, xy=(weights[0], weights[1]))
plt.title("Batch gradient descent: Weights")
plt.show()

## We apply a stochastic descent

In [None]:
print("===Stochastic descent===")
# The descent starts from a column of null weights, one per predictor
w = stoch_descent(X, y, alpha, np.zeros((X.shape[1], 1)))
print("Weights", w)
print("SSE", sse(X, y, w))
if normalized:
    # maxima holds the scales of the predictors followed by the scale
    # of the response; the weights are brought back to the original units
    maxima = maxima.reshape(-1, 1)
    w = (w / maxima[:-1, 0:1]) * maxima[-1, 0]
    print("Restored weights", w)
if debug:
    print("Logs", logs)
    print("Logs stoch.", logs_stoch)

## We plot the line

In [None]:
# The line is drawn between the smallest and the largest abscissa;
# the leading 1 multiplies the first weight
x_ends = [min(x_fig), max(x_fig)]
plt.scatter(x_fig, y_fig)
plt.plot(x_ends, [[1, x_end] @ w for x_end in x_ends])
plt.show()

## Showing the iterations

### The errors by epoch

In [None]:
# Every third item of logs, starting at index 2, is a sum of squared errors
epoch_errors = logs[2::3]
plt.scatter(range(len(epoch_errors)), epoch_errors, c='b', marker='x')
plt.title("Stochastic gradient descent: Sum of squared errors")
plt.show()

### The weight updates by epoch

In [None]:
# Every third item of logs, starting at index 0, is a weight vector:
# its first component is plotted against its second one
epoch_weights = logs[0::3]
plt.plot([weights[0] for weights in epoch_weights],
         [weights[1] for weights in epoch_weights],
         marker='o')
plt.title("Stochastic gradient descent: Weights")
plt.show()

### The errors by individual update

In [None]:
# Every third item of logs_stoch, starting at index 2, is the sum of
# squared errors measured after one individual update
update_errors = logs_stoch[2::3]
plt.scatter(range(len(update_errors)), update_errors, c='b', marker='x')
plt.title("Stochastic gradient descent: Sum of squared errors (individual updates)")
plt.show()

### The weight updates by individual update

In [None]:
# Every third item of logs_stoch, starting at index 0, is the weight
# vector obtained after one individual update
update_weights = logs_stoch[0::3]
plt.plot([weights[0] for weights in update_weights],
         [weights[1] for weights in update_weights],
         marker='o')
plt.title("Stochastic gradient descent: Weights (individual updates)")
plt.show()