## Analytical Regression
Pierre Nugues

### The modules

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

### The Functions

#### Pseudo-inverse with a numpy Matrix

In [None]:
def regression_matrix(X, y, reg=0.0):
    """
    Fit a linear regression analytically with numpy matrices.

    The weights are obtained as w = (XᵀX + reg·I)⁻¹ Xᵀ y. The operands are
    expected to be ``np.matrix`` objects, for which ``*`` is the matrix
    product and ``.I`` the inverse.

    :param X: matrix of observations, one row per observation
    :param y: column matrix of the values to predict
    :param reg: coefficient of the identity matrix added to XᵀX;
        a message is printed when it is not zero
    :return: weights, ŷ, element-wise squared errors, sum of squared errors
    """
    if reg != 0.0:
        print('Regularized')
    n_columns = X.shape[1]
    gram = X.T * X + reg * np.identity(n_columns)
    w = gram.I * X.T * y
    y_hat = X * w
    # The residual is shared by the element-wise and the summed errors
    residuals = y_hat - y
    se = np.square(residuals)
    sse = residuals.T * residuals
    return w, y_hat, se, sse

#### Pseudo-inverse with a numpy Array

In [None]:
def regression_array(X, y, reg=0.0):
    """
    Fit a linear regression analytically with numpy arrays.

    The weights are obtained as w = (XᵀX + reg·I)⁻¹ Xᵀ y, using ``@`` for
    the matrix products and ``np.linalg.inv`` for the inverse.

    :param X: array of observations, one row per observation
    :param y: array of the values to predict
    :param reg: coefficient of the identity matrix added to XᵀX;
        a message is printed when it is not zero
    :return: weights, ŷ, element-wise squared errors, sum of squared errors
    """
    if reg != 0.0:
        print('Regularized')
    gram = X.T @ X + reg * np.identity(X.shape[1])
    pseudo_inverse = np.linalg.inv(gram) @ X.T
    # Alternative noted by the author: np.linalg.pinv(X) @ y
    w = pseudo_inverse @ y
    y_hat = X @ w
    # The residual is shared by the element-wise and the summed errors
    residuals = y_hat - y
    se = residuals * residuals
    sse = residuals.T @ residuals
    return w, y_hat, se, sse

### Loading the Dataset

In [None]:
# Read each statistics file as a list of lines. The context managers close
# the file handles as soon as the content has been read.
with open('../salammbo/salammbo_a_en.tsv') as en_file:
    stat_en = en_file.read().strip().split('\n')
with open('../salammbo/salammbo_a_fr.tsv') as fr_file:
    stat_fr = fr_file.read().strip().split('\n')
# (color, marker) pair used to plot each language
pattern = [('red', 's'), ('green', '^')]
# Placeholders for the scatter-plot handles, one per language
lang = [None] * 2

### Computing the Regression Using Arrays

In [None]:
for i, stats in enumerate([stat_en, stat_fr]):
    # Each row becomes [1, x, y]: the leading 1 is the constant column
    # that is multiplied by the first weight
    observations = [[1] + [float(field) for field in obs.split()]
                    for obs in stats]
    x_l = [row[1] for row in observations]
    y_l = [row[2] for row in observations]
    color, marker = pattern[i]
    lang[i] = plt.scatter(x_l, y_l, color=color, marker=marker)

    # All the columns but the last are the predictors, the last is the target
    data = np.array(observations)
    X, y = data[:, :-1], data[:, -1]
    w, y_hat, se, sse = regression_array(X, y)

    print('Language:', i)
    print('X:', X)
    print('y:', y)
    print('ŷ:', y_hat)
    print('Squared errors:', se)
    print("Weights", w.T)
    print("SSE", sse)

    # Draw the fitted line between the smallest and the largest x
    x_min, x_max = min(x_l), max(x_l)
    plt.plot([x_min, x_max],
             [[1, x_min] @ w, [1, x_max] @ w],
             color=color)

plt.title("Salammbô")
plt.xlabel("Letter count")
plt.ylabel("A count")
plt.legend((lang[0], lang[1]), ('English', 'French'),
           loc='lower right', scatterpoints=1)
plt.show()

### Computing the Regression Using Matrices

In [None]:
for i, stats in enumerate([stat_en, stat_fr]):
    # Each row becomes [1, x, y]: the leading 1 is the constant column
    # that is multiplied by the first weight
    observations = [[1] + [float(field) for field in obs.split()]
                    for obs in stats]
    x_l = [row[1] for row in observations]
    y_l = [row[2] for row in observations]
    color, marker = pattern[i]
    lang[i] = plt.scatter(x_l, y_l, color=color, marker=marker)

    # All the columns but the last are the predictors, the last is the target
    data = np.matrix(observations)
    X, y = data[:, :-1], data[:, -1]
    w, y_hat, se, sse = regression_matrix(X, y)

    print('Language:', i)
    print('X:', X)
    print('y:', y)
    print('ŷ:', y_hat)
    print('Squared errors:', se)
    print("Weights", w.T)
    print("SSE", sse)

    # Convert the weight matrix to a plain array before using @ with lists
    w = np.array(w)
    # Draw the fitted line between the smallest and the largest x
    x_min, x_max = min(x_l), max(x_l)
    plt.plot([x_min, x_max],
             [[1, x_min] @ w, [1, x_max] @ w],
             color=color)

plt.title("Salammbô")
plt.xlabel("Letter count")
plt.ylabel("A count")
plt.legend((lang[0], lang[1]), ('English', 'French'),
           loc='lower right', scatterpoints=1)
plt.show()

### Singular Matrix

In [None]:
print('Trying regularization with a singular matrix')
# Creation of a singular matrix by duplicating a column:
# every row [..., x, y] becomes [..., x, x, y]
observations = [obs[0:-1] + [obs[-2]] + [obs[-1]] for obs in observations]
X = np.array(observations)[:, :-1]
y = np.array(observations)[:, -1]
print('X:', X)
print('y:', y)
try:
    regression_array(X, y)
except np.linalg.LinAlgError:
    # Catch only the inversion failure, through the public numpy name, so
    # that unrelated errors are not reported as a singular matrix
    print(np.linalg.LinAlgError)
    print("Singular matrix: Could not be inverted.")

### Singular Matrix with Regularization

The call returns the weights w, the predictions ŷ, the squared errors, and the sum of squared errors.

In [None]:
# Retry on the same X and y with reg=0.01: reg * I is added to X.T @ X
# before the inversion
regression_array(X, y, reg=0.01)

### Pseudo-inverse with a Quasi-singular Matrix

In [None]:
# Slightly change one value of the duplicated column so that the two
# columns are no longer exactly identical
print('Trying regularization with a quasi singular matrix')
np.set_printoptions(precision=10)
observations[0][2] -= 1e-6
data = np.array(observations)
X, y = data[:, :-1], data[:, -1]
print('X:', X)
print('y:', y)
# No regularization
regression_array(X, y)

### With Regularization

In [None]:
# With regularization: same X and y, with reg * I added to X.T @ X
# before the inversion
regression_array(X, y, reg=0.01)